1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
3 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
5 ; FIXME: Broken on evergreen
6 ; FIXME: For some reason the 8- and 16-element vectors are being stored as
7 ; individual elements instead of 128-bit stores.
; Constant-index insert: overwrite element 0 of the <2 x float> kernel argument
; %a with 5.0 and store the resulting vector to %out.
; The expected code materializes the constant in v0 (0x40a00000 is the f32 bit
; pattern of 5.0), copies the untouched element from s3 into v1, and writes
; both lanes with one dwordx2 store.
9 define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x float> %a) nounwind {
10 ; SI-LABEL: insertelement_v2f32_0:
12 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
13 ; SI-NEXT: s_mov_b32 s7, 0x100f000
14 ; SI-NEXT: s_mov_b32 s6, -1
15 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
16 ; SI-NEXT: s_waitcnt lgkmcnt(0)
17 ; SI-NEXT: s_mov_b32 s4, s0
18 ; SI-NEXT: s_mov_b32 s5, s1
19 ; SI-NEXT: v_mov_b32_e32 v1, s3
20 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
23 ; VI-LABEL: insertelement_v2f32_0:
25 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
26 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
27 ; VI-NEXT: s_mov_b32 s6, -1
28 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
29 ; VI-NEXT: s_waitcnt lgkmcnt(0)
30 ; VI-NEXT: s_mov_b32 s4, s0
31 ; VI-NEXT: s_mov_b32 s5, s1
32 ; VI-NEXT: v_mov_b32_e32 v1, s3
33 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
35 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0
36 store <2 x float> %vecins, ptr addrspace(1) %out, align 16
; Constant-index insert: overwrite element 1 of the <2 x float> kernel argument
; %a with 5.0 and store the resulting vector to %out.
; Mirror image of the element-0 case: the constant (0x40a00000, i.e. 5.0) is
; expected in v1 and the untouched element is copied from s2 into v0.
40 define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x float> %a) nounwind {
41 ; SI-LABEL: insertelement_v2f32_1:
43 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
44 ; SI-NEXT: s_mov_b32 s7, 0x100f000
45 ; SI-NEXT: s_mov_b32 s6, -1
46 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
47 ; SI-NEXT: s_waitcnt lgkmcnt(0)
48 ; SI-NEXT: s_mov_b32 s4, s0
49 ; SI-NEXT: s_mov_b32 s5, s1
50 ; SI-NEXT: v_mov_b32_e32 v0, s2
51 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
54 ; VI-LABEL: insertelement_v2f32_1:
56 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
57 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
58 ; VI-NEXT: s_mov_b32 s6, -1
59 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
60 ; VI-NEXT: s_waitcnt lgkmcnt(0)
61 ; VI-NEXT: s_mov_b32 s4, s0
62 ; VI-NEXT: s_mov_b32 s5, s1
63 ; VI-NEXT: v_mov_b32_e32 v0, s2
64 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
66 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
67 store <2 x float> %vecins, ptr addrspace(1) %out, align 16
; Integer variant of the constant-index insert: overwrite element 0 of the
; <2 x i32> kernel argument %a with 999 and store the result to %out.
; The expected code puts the constant (0x3e7 == 999) in v0 and copies the
; untouched element from s3 into v1.
71 define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
72 ; SI-LABEL: insertelement_v2i32_0:
74 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
75 ; SI-NEXT: s_mov_b32 s7, 0x100f000
76 ; SI-NEXT: s_mov_b32 s6, -1
77 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
78 ; SI-NEXT: s_waitcnt lgkmcnt(0)
79 ; SI-NEXT: s_mov_b32 s4, s0
80 ; SI-NEXT: s_mov_b32 s5, s1
81 ; SI-NEXT: v_mov_b32_e32 v1, s3
82 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
85 ; VI-LABEL: insertelement_v2i32_0:
87 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
88 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
89 ; VI-NEXT: s_mov_b32 s6, -1
90 ; VI-NEXT: v_mov_b32_e32 v0, 0x3e7
91 ; VI-NEXT: s_waitcnt lgkmcnt(0)
92 ; VI-NEXT: s_mov_b32 s4, s0
93 ; VI-NEXT: s_mov_b32 s5, s1
94 ; VI-NEXT: v_mov_b32_e32 v1, s3
95 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
97 %vecins = insertelement <2 x i32> %a, i32 999, i32 0
98 store <2 x i32> %vecins, ptr addrspace(1) %out, align 16
; Integer variant of the constant-index insert: overwrite element 1 of the
; <2 x i32> kernel argument %a with 999 and store the result to %out.
; The expected code puts the constant (0x3e7 == 999) in v1 and copies the
; untouched element from s2 into v0.
102 define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
103 ; SI-LABEL: insertelement_v2i32_1:
105 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
106 ; SI-NEXT: s_mov_b32 s7, 0x100f000
107 ; SI-NEXT: s_mov_b32 s6, -1
108 ; SI-NEXT: v_mov_b32_e32 v1, 0x3e7
109 ; SI-NEXT: s_waitcnt lgkmcnt(0)
110 ; SI-NEXT: s_mov_b32 s4, s0
111 ; SI-NEXT: s_mov_b32 s5, s1
112 ; SI-NEXT: v_mov_b32_e32 v0, s2
113 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
116 ; VI-LABEL: insertelement_v2i32_1:
118 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
119 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
120 ; VI-NEXT: s_mov_b32 s6, -1
121 ; VI-NEXT: v_mov_b32_e32 v1, 0x3e7
122 ; VI-NEXT: s_waitcnt lgkmcnt(0)
123 ; VI-NEXT: s_mov_b32 s4, s0
124 ; VI-NEXT: s_mov_b32 s5, s1
125 ; VI-NEXT: v_mov_b32_e32 v0, s2
126 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
128 %vecins = insertelement <2 x i32> %a, i32 999, i32 1
129 store <2 x i32> %vecins, ptr addrspace(1) %out, align 16
133 ; FIXME: Why is the constant moved into the intermediate register and
134 ; not just directly into the vector component?
; Constant-index insert: overwrite element 0 of the <4 x float> kernel argument
; %a with 5.0 and store the resulting vector to %out.
; The expected code first writes the constant (0x40a00000, i.e. 5.0) into s0 of
; the loaded s[0:3] tuple and only then copies all four lanes into v[0:3] for
; a single dwordx4 store.
135 define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x float> %a) nounwind {
136 ; SI-LABEL: insertelement_v4f32_0:
138 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
139 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
140 ; SI-NEXT: s_waitcnt lgkmcnt(0)
141 ; SI-NEXT: s_mov_b32 s0, 0x40a00000
142 ; SI-NEXT: s_mov_b32 s7, 0x100f000
143 ; SI-NEXT: s_mov_b32 s6, -1
144 ; SI-NEXT: v_mov_b32_e32 v0, s0
145 ; SI-NEXT: v_mov_b32_e32 v1, s1
146 ; SI-NEXT: v_mov_b32_e32 v2, s2
147 ; SI-NEXT: v_mov_b32_e32 v3, s3
148 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
151 ; VI-LABEL: insertelement_v4f32_0:
153 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
154 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
155 ; VI-NEXT: s_waitcnt lgkmcnt(0)
156 ; VI-NEXT: s_mov_b32 s0, 0x40a00000
157 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
158 ; VI-NEXT: s_mov_b32 s6, -1
159 ; VI-NEXT: v_mov_b32_e32 v0, s0
160 ; VI-NEXT: v_mov_b32_e32 v1, s1
161 ; VI-NEXT: v_mov_b32_e32 v2, s2
162 ; VI-NEXT: v_mov_b32_e32 v3, s3
163 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
165 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
166 store <4 x float> %vecins, ptr addrspace(1) %out, align 16
; Constant-index insert: overwrite element 1 of the <4 x float> kernel argument
; %a with 5.0 and store the resulting vector to %out.
; The expected code overwrites s1 of the loaded s[0:3] tuple with 0x40a00000
; (5.0) and then copies all four lanes into v[0:3] for a dwordx4 store.
170 define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x float> %a) nounwind {
171 ; SI-LABEL: insertelement_v4f32_1:
173 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
174 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
175 ; SI-NEXT: s_waitcnt lgkmcnt(0)
176 ; SI-NEXT: s_mov_b32 s1, 0x40a00000
177 ; SI-NEXT: s_mov_b32 s7, 0x100f000
178 ; SI-NEXT: s_mov_b32 s6, -1
179 ; SI-NEXT: v_mov_b32_e32 v0, s0
180 ; SI-NEXT: v_mov_b32_e32 v1, s1
181 ; SI-NEXT: v_mov_b32_e32 v2, s2
182 ; SI-NEXT: v_mov_b32_e32 v3, s3
183 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
186 ; VI-LABEL: insertelement_v4f32_1:
188 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
189 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
190 ; VI-NEXT: s_waitcnt lgkmcnt(0)
191 ; VI-NEXT: s_mov_b32 s1, 0x40a00000
192 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
193 ; VI-NEXT: s_mov_b32 s6, -1
194 ; VI-NEXT: v_mov_b32_e32 v0, s0
195 ; VI-NEXT: v_mov_b32_e32 v1, s1
196 ; VI-NEXT: v_mov_b32_e32 v2, s2
197 ; VI-NEXT: v_mov_b32_e32 v3, s3
198 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
200 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
201 store <4 x float> %vecins, ptr addrspace(1) %out, align 16
; Constant-index insert: overwrite element 2 of the <4 x float> kernel argument
; %a with 5.0 and store the resulting vector to %out.
; The expected code overwrites s2 of the loaded s[0:3] tuple with 0x40a00000
; (5.0) and then copies all four lanes into v[0:3] for a dwordx4 store.
205 define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x float> %a) nounwind {
206 ; SI-LABEL: insertelement_v4f32_2:
208 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
209 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
210 ; SI-NEXT: s_waitcnt lgkmcnt(0)
211 ; SI-NEXT: s_mov_b32 s2, 0x40a00000
212 ; SI-NEXT: s_mov_b32 s7, 0x100f000
213 ; SI-NEXT: s_mov_b32 s6, -1
214 ; SI-NEXT: v_mov_b32_e32 v0, s0
215 ; SI-NEXT: v_mov_b32_e32 v1, s1
216 ; SI-NEXT: v_mov_b32_e32 v2, s2
217 ; SI-NEXT: v_mov_b32_e32 v3, s3
218 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
221 ; VI-LABEL: insertelement_v4f32_2:
223 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
224 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
225 ; VI-NEXT: s_waitcnt lgkmcnt(0)
226 ; VI-NEXT: s_mov_b32 s2, 0x40a00000
227 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
228 ; VI-NEXT: s_mov_b32 s6, -1
229 ; VI-NEXT: v_mov_b32_e32 v0, s0
230 ; VI-NEXT: v_mov_b32_e32 v1, s1
231 ; VI-NEXT: v_mov_b32_e32 v2, s2
232 ; VI-NEXT: v_mov_b32_e32 v3, s3
233 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
235 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
236 store <4 x float> %vecins, ptr addrspace(1) %out, align 16
; Constant-index insert: overwrite element 3 (the last lane) of the
; <4 x float> kernel argument %a with 5.0 and store the result to %out.
; The expected code overwrites s3 of the loaded s[0:3] tuple with 0x40a00000
; (5.0) and then copies all four lanes into v[0:3] for a dwordx4 store.
240 define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x float> %a) nounwind {
241 ; SI-LABEL: insertelement_v4f32_3:
243 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
244 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
245 ; SI-NEXT: s_waitcnt lgkmcnt(0)
246 ; SI-NEXT: s_mov_b32 s3, 0x40a00000
247 ; SI-NEXT: s_mov_b32 s7, 0x100f000
248 ; SI-NEXT: s_mov_b32 s6, -1
249 ; SI-NEXT: v_mov_b32_e32 v0, s0
250 ; SI-NEXT: v_mov_b32_e32 v1, s1
251 ; SI-NEXT: v_mov_b32_e32 v2, s2
252 ; SI-NEXT: v_mov_b32_e32 v3, s3
253 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
256 ; VI-LABEL: insertelement_v4f32_3:
258 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
259 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
260 ; VI-NEXT: s_waitcnt lgkmcnt(0)
261 ; VI-NEXT: s_mov_b32 s3, 0x40a00000
262 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
263 ; VI-NEXT: s_mov_b32 s6, -1
264 ; VI-NEXT: v_mov_b32_e32 v0, s0
265 ; VI-NEXT: v_mov_b32_e32 v1, s1
266 ; VI-NEXT: v_mov_b32_e32 v2, s2
267 ; VI-NEXT: v_mov_b32_e32 v3, s3
268 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
270 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
271 store <4 x float> %vecins, ptr addrspace(1) %out, align 16
; Integer variant of the 4-element constant-index insert: overwrite element 0
; of the <4 x i32> kernel argument %a with 999 and store the result to %out.
; The expected code writes the constant into s0 with s_movk_i32 (0x3e7 == 999)
; and then copies all four lanes into v[0:3] for a dwordx4 store.
275 define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32> %a) nounwind {
276 ; SI-LABEL: insertelement_v4i32_0:
278 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
279 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
280 ; SI-NEXT: s_waitcnt lgkmcnt(0)
281 ; SI-NEXT: s_movk_i32 s0, 0x3e7
282 ; SI-NEXT: s_mov_b32 s7, 0x100f000
283 ; SI-NEXT: s_mov_b32 s6, -1
284 ; SI-NEXT: v_mov_b32_e32 v0, s0
285 ; SI-NEXT: v_mov_b32_e32 v1, s1
286 ; SI-NEXT: v_mov_b32_e32 v2, s2
287 ; SI-NEXT: v_mov_b32_e32 v3, s3
288 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
291 ; VI-LABEL: insertelement_v4i32_0:
293 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
294 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
295 ; VI-NEXT: s_waitcnt lgkmcnt(0)
296 ; VI-NEXT: s_movk_i32 s0, 0x3e7
297 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
298 ; VI-NEXT: s_mov_b32 s6, -1
299 ; VI-NEXT: v_mov_b32_e32 v0, s0
300 ; VI-NEXT: v_mov_b32_e32 v1, s1
301 ; VI-NEXT: v_mov_b32_e32 v2, s2
302 ; VI-NEXT: v_mov_b32_e32 v3, s3
303 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
305 %vecins = insertelement <4 x i32> %a, i32 999, i32 0
306 store <4 x i32> %vecins, ptr addrspace(1) %out, align 16
; Odd-width vector: overwrite element 1 of the <3 x float> kernel argument %a
; with 5.0 and store the result to %out.
; The argument is fetched with a dwordx4 load, but only s0 and s2 are copied
; (to v0 and v2); v1 receives the constant 0x40a00000 (5.0) directly and the
; three lanes are written with one dwordx3 store.
310 define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x float> %a) nounwind {
311 ; SI-LABEL: insertelement_v3f32_1:
313 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
314 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
315 ; SI-NEXT: s_mov_b32 s7, 0x100f000
316 ; SI-NEXT: s_mov_b32 s6, -1
317 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
318 ; SI-NEXT: s_waitcnt lgkmcnt(0)
319 ; SI-NEXT: v_mov_b32_e32 v0, s0
320 ; SI-NEXT: v_mov_b32_e32 v2, s2
321 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
324 ; VI-LABEL: insertelement_v3f32_1:
326 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
327 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
328 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
329 ; VI-NEXT: s_mov_b32 s6, -1
330 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
331 ; VI-NEXT: s_waitcnt lgkmcnt(0)
332 ; VI-NEXT: v_mov_b32_e32 v0, s0
333 ; VI-NEXT: v_mov_b32_e32 v2, s2
334 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
336 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
337 store <3 x float> %vecins, ptr addrspace(1) %out, align 16
; Odd-width vector: overwrite element 2 (the last lane) of the <3 x float>
; kernel argument %a with 5.0 and store the result to %out.
; The expected code copies s0 and s1 into v0 and v1, places the constant
; 0x40a00000 (5.0) directly in v2, and writes a single dwordx3 store.
341 define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x float> %a) nounwind {
342 ; SI-LABEL: insertelement_v3f32_2:
344 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
345 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
346 ; SI-NEXT: s_mov_b32 s7, 0x100f000
347 ; SI-NEXT: s_mov_b32 s6, -1
348 ; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000
349 ; SI-NEXT: s_waitcnt lgkmcnt(0)
350 ; SI-NEXT: v_mov_b32_e32 v0, s0
351 ; SI-NEXT: v_mov_b32_e32 v1, s1
352 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
355 ; VI-LABEL: insertelement_v3f32_2:
357 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
358 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
359 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
360 ; VI-NEXT: s_mov_b32 s6, -1
361 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000
362 ; VI-NEXT: s_waitcnt lgkmcnt(0)
363 ; VI-NEXT: v_mov_b32_e32 v0, s0
364 ; VI-NEXT: v_mov_b32_e32 v1, s1
365 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
367 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
368 store <3 x float> %vecins, ptr addrspace(1) %out, align 16
; Out-of-range constant index: element 3 does not exist in a <3 x float>
; (valid indices are 0..2). Per the LLVM LangRef an insertelement whose index
; exceeds the vector length yields poison, so the stored value is unspecified.
; NOTE(review): presumably this only checks that codegen does not crash on the
; out-of-bounds index -- confirm against the full set of assertions.
372 define amdgpu_kernel void @insertelement_v3f32_3(ptr addrspace(1) %out, <3 x float> %a) nounwind {
373 ; GCN-LABEL: insertelement_v3f32_3:
376 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
377 store <3 x float> %vecins, ptr addrspace(1) %out, align 16
; Non-kernel function: loads a <4 x i32> through an undef addrspace(4) pointer,
; replaces element 0 with 0, and passes the vector as the <4 x i32> operand of
; llvm.amdgcn.image.gather4.lz.2d.
; The expected code performs the insert directly on the scalar register tuple
; (s_mov_b32 s4, 0 on the loaded s[4:7]) with no copy through VGPRs before the
; image instruction consumes s[4:7].
381 define <4 x float> @insertelement_to_sgpr() nounwind {
382 ; GCN-LABEL: insertelement_to_sgpr:
384 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
385 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
386 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
387 ; GCN-NEXT: s_mov_b32 s4, 0
388 ; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1
389 ; GCN-NEXT: s_waitcnt vmcnt(0)
390 ; GCN-NEXT: s_setpc_b64 s[30:31]
391 %tmp = load <4 x i32>, ptr addrspace(4) undef
392 %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
393 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
394 ret <4 x float> %tmp2
; Non-kernel function returning a <9 x float>: loads the vector through an
; undef addrspace(4) pointer and overwrites elements 0, 2 and 7 with 5.0, -5.0
; and 17.0 respectively.
; The expected code loads the vector as a dwordx8 plus a single dword for the
; ninth element, and writes the three constants straight into v0, v2 and v7
; (0x40a00000 = 5.0, 0xc0a00000 = -5.0, 0x41880000 = 17.0).
397 define <9 x float> @insertelement_to_v9f32_undef() nounwind {
398 ; GCN-LABEL: insertelement_to_v9f32_undef:
400 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
401 ; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
402 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40a00000
403 ; GCN-NEXT: v_mov_b32_e32 v2, 0xc0a00000
404 ; GCN-NEXT: v_mov_b32_e32 v7, 0x41880000
405 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
406 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
407 ; GCN-NEXT: v_mov_b32_e32 v1, s5
408 ; GCN-NEXT: v_mov_b32_e32 v3, s7
409 ; GCN-NEXT: v_mov_b32_e32 v4, s8
410 ; GCN-NEXT: v_mov_b32_e32 v5, s9
411 ; GCN-NEXT: v_mov_b32_e32 v6, s10
412 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
413 ; GCN-NEXT: v_mov_b32_e32 v8, s4
414 ; GCN-NEXT: s_setpc_b64 s[30:31]
415 %tmp = load <9 x float>, ptr addrspace(4) undef
416 %tmp1 = insertelement <9 x float> %tmp, float 5.000, i32 0
417 %tmp2 = insertelement <9 x float> %tmp1, float -5.000, i32 2
418 %tmp3 = insertelement <9 x float> %tmp2, float 17.000, i32 7
419 ret <9 x float> %tmp3
; Non-kernel function returning a <10 x float>: loads the vector through an
; undef addrspace(4) pointer and overwrites element 0 with 2.0.
; The expected code loads the vector as a dwordx8 plus a dwordx2, writes the
; constant into v0 as the operand "2.0", and copies the remaining nine
; elements from SGPRs into v1..v9.
422 define <10 x float> @insertelement_to_v10f32_undef() nounwind {
423 ; GCN-LABEL: insertelement_to_v10f32_undef:
425 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
426 ; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
427 ; GCN-NEXT: v_mov_b32_e32 v0, 2.0
428 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
429 ; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
430 ; GCN-NEXT: v_mov_b32_e32 v1, s5
431 ; GCN-NEXT: v_mov_b32_e32 v2, s6
432 ; GCN-NEXT: v_mov_b32_e32 v3, s7
433 ; GCN-NEXT: v_mov_b32_e32 v4, s8
434 ; GCN-NEXT: v_mov_b32_e32 v5, s9
435 ; GCN-NEXT: v_mov_b32_e32 v6, s10
436 ; GCN-NEXT: v_mov_b32_e32 v7, s11
437 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
438 ; GCN-NEXT: v_mov_b32_e32 v8, s12
439 ; GCN-NEXT: v_mov_b32_e32 v9, s13
440 ; GCN-NEXT: s_setpc_b64 s[30:31]
441 %tmp = load <10 x float>, ptr addrspace(4) undef
442 %tmp1 = insertelement <10 x float> %tmp, float 2.0, i32 0
443 ret <10 x float> %tmp1
; Non-kernel function returning an <11 x float>: loads the vector through an
; undef addrspace(4) pointer and overwrites element 0 with 1.0.
; The expected code loads the vector as a dwordx8 plus a dwordx4, writes the
; constant into v0 as the operand "1.0", and copies the remaining ten elements
; from SGPRs into v1..v10 (s15 of the second load is not used).
446 define <11 x float> @insertelement_to_v11f32_undef() nounwind {
447 ; GCN-LABEL: insertelement_to_v11f32_undef:
449 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450 ; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
451 ; GCN-NEXT: v_mov_b32_e32 v0, 1.0
452 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
453 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
454 ; GCN-NEXT: v_mov_b32_e32 v1, s5
455 ; GCN-NEXT: v_mov_b32_e32 v2, s6
456 ; GCN-NEXT: v_mov_b32_e32 v3, s7
457 ; GCN-NEXT: v_mov_b32_e32 v4, s8
458 ; GCN-NEXT: v_mov_b32_e32 v5, s9
459 ; GCN-NEXT: v_mov_b32_e32 v6, s10
460 ; GCN-NEXT: v_mov_b32_e32 v7, s11
461 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
462 ; GCN-NEXT: v_mov_b32_e32 v8, s12
463 ; GCN-NEXT: v_mov_b32_e32 v9, s13
464 ; GCN-NEXT: v_mov_b32_e32 v10, s14
465 ; GCN-NEXT: s_setpc_b64 s[30:31]
466 %tmp = load <11 x float>, ptr addrspace(4) undef
467 %tmp1 = insertelement <11 x float> %tmp, float 1.000, i32 0
468 ret <11 x float> %tmp1
; Non-kernel function returning a <12 x float>: loads the vector through an
; undef addrspace(4) pointer and overwrites element 0 with 4.0.
; The expected code loads the vector as a dwordx8 plus a dwordx4, writes the
; constant into v0 as the operand "4.0", and copies the remaining eleven
; elements from SGPRs into v1..v11.
471 define <12 x float> @insertelement_to_v12f32_undef() nounwind {
472 ; GCN-LABEL: insertelement_to_v12f32_undef:
474 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
475 ; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
476 ; GCN-NEXT: v_mov_b32_e32 v0, 4.0
477 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
478 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
479 ; GCN-NEXT: v_mov_b32_e32 v1, s5
480 ; GCN-NEXT: v_mov_b32_e32 v2, s6
481 ; GCN-NEXT: v_mov_b32_e32 v3, s7
482 ; GCN-NEXT: v_mov_b32_e32 v4, s8
483 ; GCN-NEXT: v_mov_b32_e32 v5, s9
484 ; GCN-NEXT: v_mov_b32_e32 v6, s10
485 ; GCN-NEXT: v_mov_b32_e32 v7, s11
486 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
487 ; GCN-NEXT: v_mov_b32_e32 v8, s12
488 ; GCN-NEXT: v_mov_b32_e32 v9, s13
489 ; GCN-NEXT: v_mov_b32_e32 v10, s14
490 ; GCN-NEXT: v_mov_b32_e32 v11, s15
491 ; GCN-NEXT: s_setpc_b64 s[30:31]
492 %tmp = load <12 x float>, ptr addrspace(4) undef
493 %tmp1 = insertelement <12 x float> %tmp, float 4.0, i32 0
494 ret <12 x float> %tmp1
; Dynamic-index insert: the lane to overwrite with 5.0 is the runtime kernel
; argument %b; the <2 x float> result is stored to %out (align 8).
; For this small vector the expected code uses a compare/select chain instead
; of indexed register access: for each lane it compares the index register
; against that lane number (s_cmp_lg_u32) and uses v_cndmask to pick either
; the original element or the constant 0x40a00000 (5.0) held in v0.
497 define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind {
498 ; SI-LABEL: dynamic_insertelement_v2f32:
500 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
501 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
502 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
503 ; SI-NEXT: s_mov_b32 s7, 0x100f000
504 ; SI-NEXT: s_mov_b32 s6, -1
505 ; SI-NEXT: s_waitcnt lgkmcnt(0)
506 ; SI-NEXT: s_cmp_lg_u32 s2, 1
507 ; SI-NEXT: v_mov_b32_e32 v1, s1
508 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
509 ; SI-NEXT: s_cmp_lg_u32 s2, 0
510 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
511 ; SI-NEXT: v_mov_b32_e32 v2, s0
512 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
513 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
514 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
517 ; VI-LABEL: dynamic_insertelement_v2f32:
519 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
520 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
521 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
522 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
523 ; VI-NEXT: s_mov_b32 s6, -1
524 ; VI-NEXT: s_waitcnt lgkmcnt(0)
525 ; VI-NEXT: s_cmp_lg_u32 s2, 1
526 ; VI-NEXT: v_mov_b32_e32 v1, s1
527 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
528 ; VI-NEXT: s_cmp_lg_u32 s2, 0
529 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
530 ; VI-NEXT: v_mov_b32_e32 v2, s0
531 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
532 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
533 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
535 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
536 store <2 x float> %vecins, ptr addrspace(1) %out, align 8
; Dynamic-index insert into a <3 x float>: the lane to overwrite with 5.0 is
; the runtime kernel argument %b; the result is stored to %out.
; The expected code is a three-step compare/select chain: the index (s8) is
; compared against 2, 1 and 0 in turn, and each v_cndmask picks either the
; original element or the constant 0x40a00000 (5.0) held in v0. The result is
; written with a single dwordx3 store.
540 define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 x float> %a, i32 %b) nounwind {
541 ; SI-LABEL: dynamic_insertelement_v3f32:
543 ; SI-NEXT: s_load_dword s8, s[4:5], 0x8
544 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
545 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
546 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
547 ; SI-NEXT: s_mov_b32 s3, 0x100f000
548 ; SI-NEXT: s_waitcnt lgkmcnt(0)
549 ; SI-NEXT: s_cmp_lg_u32 s8, 2
550 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
551 ; SI-NEXT: v_mov_b32_e32 v1, s6
552 ; SI-NEXT: s_cmp_lg_u32 s8, 1
553 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
554 ; SI-NEXT: v_mov_b32_e32 v1, s5
555 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
556 ; SI-NEXT: s_cmp_lg_u32 s8, 0
557 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
558 ; SI-NEXT: v_mov_b32_e32 v3, s4
559 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
560 ; SI-NEXT: s_mov_b32 s2, -1
561 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
562 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
565 ; VI-LABEL: dynamic_insertelement_v3f32:
567 ; VI-NEXT: s_load_dword s8, s[4:5], 0x20
568 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
569 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
570 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
571 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
572 ; VI-NEXT: s_waitcnt lgkmcnt(0)
573 ; VI-NEXT: s_cmp_lg_u32 s8, 2
574 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
575 ; VI-NEXT: v_mov_b32_e32 v1, s6
576 ; VI-NEXT: s_cmp_lg_u32 s8, 1
577 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
578 ; VI-NEXT: v_mov_b32_e32 v1, s5
579 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
580 ; VI-NEXT: s_cmp_lg_u32 s8, 0
581 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
582 ; VI-NEXT: v_mov_b32_e32 v3, s4
583 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
584 ; VI-NEXT: s_mov_b32 s2, -1
585 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
586 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
588 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
589 store <3 x float> %vecins, ptr addrspace(1) %out, align 16
; Dynamic-index insert into a <4 x float>: the lane to overwrite with 5.0 is
; the runtime kernel argument %b; the result is stored to %out.
; The expected code is a four-step compare/select chain: the index (s8) is
; compared against 3, 2, 1 and 0 in turn, and each v_cndmask picks either the
; original element or the constant 0x40a00000 (5.0) held in v0. The result is
; written with a single dwordx4 store.
593 define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 x float> %a, i32 %b) nounwind {
594 ; SI-LABEL: dynamic_insertelement_v4f32:
596 ; SI-NEXT: s_load_dword s8, s[4:5], 0x8
597 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
598 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
599 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
600 ; SI-NEXT: s_mov_b32 s3, 0x100f000
601 ; SI-NEXT: s_waitcnt lgkmcnt(0)
602 ; SI-NEXT: s_cmp_lg_u32 s8, 3
603 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
604 ; SI-NEXT: v_mov_b32_e32 v1, s7
605 ; SI-NEXT: s_cmp_lg_u32 s8, 2
606 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
607 ; SI-NEXT: v_mov_b32_e32 v1, s6
608 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
609 ; SI-NEXT: s_cmp_lg_u32 s8, 1
610 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
611 ; SI-NEXT: v_mov_b32_e32 v1, s5
612 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
613 ; SI-NEXT: s_cmp_lg_u32 s8, 0
614 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
615 ; SI-NEXT: v_mov_b32_e32 v4, s4
616 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
617 ; SI-NEXT: s_mov_b32 s2, -1
618 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
619 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
622 ; VI-LABEL: dynamic_insertelement_v4f32:
624 ; VI-NEXT: s_load_dword s8, s[4:5], 0x20
625 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
626 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
627 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
628 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
629 ; VI-NEXT: s_waitcnt lgkmcnt(0)
630 ; VI-NEXT: s_cmp_lg_u32 s8, 3
631 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
632 ; VI-NEXT: v_mov_b32_e32 v1, s7
633 ; VI-NEXT: s_cmp_lg_u32 s8, 2
634 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
635 ; VI-NEXT: v_mov_b32_e32 v1, s6
636 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
637 ; VI-NEXT: s_cmp_lg_u32 s8, 1
638 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
639 ; VI-NEXT: v_mov_b32_e32 v1, s5
640 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
641 ; VI-NEXT: s_cmp_lg_u32 s8, 0
642 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
643 ; VI-NEXT: v_mov_b32_e32 v4, s4
644 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
645 ; VI-NEXT: s_mov_b32 s2, -1
646 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
647 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
649 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
650 store <4 x float> %vecins, ptr addrspace(1) %out, align 16
; Dynamic-index insert into an <8 x float>: the lane to overwrite with 5.0 is
; the runtime kernel argument %b; the result is stored to %out (align 32).
; Unlike the 2/3/4-element cases, the expected code copies the whole vector
; into v[0:7], loads the index into m0, and performs the insert with a single
; m0-relative move (v_movreld_b32 v0, v8, where v8 holds 0x40a00000 = 5.0).
; The result is written as two dwordx4 stores.
654 define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind {
655 ; SI-LABEL: dynamic_insertelement_v8f32:
657 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
658 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
659 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10
660 ; SI-NEXT: v_mov_b32_e32 v8, 0x40a00000
661 ; SI-NEXT: s_mov_b32 s3, 0x100f000
662 ; SI-NEXT: s_mov_b32 s2, -1
663 ; SI-NEXT: s_waitcnt lgkmcnt(0)
664 ; SI-NEXT: v_mov_b32_e32 v0, s8
665 ; SI-NEXT: v_mov_b32_e32 v1, s9
666 ; SI-NEXT: v_mov_b32_e32 v2, s10
667 ; SI-NEXT: v_mov_b32_e32 v3, s11
668 ; SI-NEXT: v_mov_b32_e32 v4, s12
669 ; SI-NEXT: v_mov_b32_e32 v5, s13
670 ; SI-NEXT: v_mov_b32_e32 v6, s14
671 ; SI-NEXT: v_mov_b32_e32 v7, s15
672 ; SI-NEXT: s_mov_b32 m0, s4
673 ; SI-NEXT: v_movreld_b32_e32 v0, v8
674 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
675 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
678 ; VI-LABEL: dynamic_insertelement_v8f32:
680 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
681 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
682 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40
683 ; VI-NEXT: v_mov_b32_e32 v8, 0x40a00000
684 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
685 ; VI-NEXT: s_mov_b32 s2, -1
686 ; VI-NEXT: s_waitcnt lgkmcnt(0)
687 ; VI-NEXT: v_mov_b32_e32 v0, s8
688 ; VI-NEXT: v_mov_b32_e32 v1, s9
689 ; VI-NEXT: v_mov_b32_e32 v2, s10
690 ; VI-NEXT: v_mov_b32_e32 v3, s11
691 ; VI-NEXT: v_mov_b32_e32 v4, s12
692 ; VI-NEXT: v_mov_b32_e32 v5, s13
693 ; VI-NEXT: v_mov_b32_e32 v6, s14
694 ; VI-NEXT: v_mov_b32_e32 v7, s15
695 ; VI-NEXT: s_mov_b32 m0, s4
696 ; VI-NEXT: v_movreld_b32_e32 v0, v8
697 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
698 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
700 %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
701 store <8 x float> %vecins, ptr addrspace(1) %out, align 32
; Dynamic-index insert into a <9 x float>: the lane to overwrite with 5.0 is
; the runtime kernel argument %b; the result is stored to %out (align 32).
; The expected code loads the argument as a dwordx8 plus one extra dword,
; copies all nine elements into v[0:8], loads the index into m0, and inserts
; with an m0-relative move (v_movreld_b32 v0, v9; v9 holds 0x40a00000 = 5.0).
; The result is written as two dwordx4 stores plus a single-dword store at
; offset 32 for the ninth element.
705 define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 x float> %a, i32 %b) nounwind {
706 ; SI-LABEL: dynamic_insertelement_v9f32:
708 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
709 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
710 ; SI-NEXT: s_load_dword s6, s[4:5], 0x18
711 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
712 ; SI-NEXT: v_mov_b32_e32 v9, 0x40a00000
713 ; SI-NEXT: s_mov_b32 s3, 0x100f000
714 ; SI-NEXT: s_waitcnt lgkmcnt(0)
715 ; SI-NEXT: v_mov_b32_e32 v0, s8
716 ; SI-NEXT: v_mov_b32_e32 v1, s9
717 ; SI-NEXT: v_mov_b32_e32 v2, s10
718 ; SI-NEXT: v_mov_b32_e32 v3, s11
719 ; SI-NEXT: v_mov_b32_e32 v4, s12
720 ; SI-NEXT: v_mov_b32_e32 v5, s13
721 ; SI-NEXT: v_mov_b32_e32 v6, s14
722 ; SI-NEXT: v_mov_b32_e32 v7, s15
723 ; SI-NEXT: v_mov_b32_e32 v8, s6
724 ; SI-NEXT: s_mov_b32 m0, s4
725 ; SI-NEXT: s_mov_b32 s2, -1
726 ; SI-NEXT: v_movreld_b32_e32 v0, v9
727 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
728 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
729 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
732 ; VI-LABEL: dynamic_insertelement_v9f32:
734 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
735 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
736 ; VI-NEXT: s_load_dword s6, s[4:5], 0x60
737 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
738 ; VI-NEXT: v_mov_b32_e32 v9, 0x40a00000
739 ; VI-NEXT: s_waitcnt lgkmcnt(0)
740 ; VI-NEXT: v_mov_b32_e32 v0, s8
741 ; VI-NEXT: v_mov_b32_e32 v1, s9
742 ; VI-NEXT: v_mov_b32_e32 v2, s10
743 ; VI-NEXT: v_mov_b32_e32 v3, s11
744 ; VI-NEXT: v_mov_b32_e32 v4, s12
745 ; VI-NEXT: v_mov_b32_e32 v5, s13
746 ; VI-NEXT: v_mov_b32_e32 v6, s14
747 ; VI-NEXT: v_mov_b32_e32 v7, s15
748 ; VI-NEXT: v_mov_b32_e32 v8, s6
749 ; VI-NEXT: s_mov_b32 m0, s4
750 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
751 ; VI-NEXT: s_mov_b32 s2, -1
752 ; VI-NEXT: v_movreld_b32_e32 v0, v9
753 ; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
754 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
755 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
757 %vecins = insertelement <9 x float> %a, float 5.000000e+00, i32 %b
758 store <9 x float> %vecins, ptr addrspace(1) %out, align 32
; Dynamic-index insert into a <10 x float>: the lane to overwrite with 5.0 is
; the runtime kernel argument %b; the result is stored to %out (align 32).
; The expected code loads the argument as a dwordx8 plus a dwordx2, copies all
; ten elements into v[0:9], loads the index into m0, and inserts with an
; m0-relative move (v_movreld_b32 v0, v10; v10 holds 0x40a00000 = 5.0).
; The result is written as two dwordx4 stores plus a dwordx2 store at
; offset 32.
762 define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, <10 x float> %a, i32 %b) nounwind {
763 ; SI-LABEL: dynamic_insertelement_v10f32:
765 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
766 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
767 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18
768 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
769 ; SI-NEXT: v_mov_b32_e32 v10, 0x40a00000
770 ; SI-NEXT: s_mov_b32 s3, 0x100f000
771 ; SI-NEXT: s_waitcnt lgkmcnt(0)
772 ; SI-NEXT: v_mov_b32_e32 v0, s8
773 ; SI-NEXT: v_mov_b32_e32 v1, s9
774 ; SI-NEXT: v_mov_b32_e32 v2, s10
775 ; SI-NEXT: v_mov_b32_e32 v3, s11
776 ; SI-NEXT: v_mov_b32_e32 v4, s12
777 ; SI-NEXT: v_mov_b32_e32 v5, s13
778 ; SI-NEXT: v_mov_b32_e32 v6, s14
779 ; SI-NEXT: v_mov_b32_e32 v7, s15
780 ; SI-NEXT: v_mov_b32_e32 v8, s6
781 ; SI-NEXT: v_mov_b32_e32 v9, s7
782 ; SI-NEXT: s_mov_b32 m0, s4
783 ; SI-NEXT: s_mov_b32 s2, -1
784 ; SI-NEXT: v_movreld_b32_e32 v0, v10
785 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
786 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
787 ; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
790 ; VI-LABEL: dynamic_insertelement_v10f32:
792 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
793 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
794 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x60
795 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
796 ; VI-NEXT: v_mov_b32_e32 v10, 0x40a00000
797 ; VI-NEXT: s_waitcnt lgkmcnt(0)
798 ; VI-NEXT: v_mov_b32_e32 v0, s8
799 ; VI-NEXT: v_mov_b32_e32 v1, s9
800 ; VI-NEXT: v_mov_b32_e32 v2, s10
801 ; VI-NEXT: v_mov_b32_e32 v3, s11
802 ; VI-NEXT: v_mov_b32_e32 v4, s12
803 ; VI-NEXT: v_mov_b32_e32 v5, s13
804 ; VI-NEXT: v_mov_b32_e32 v6, s14
805 ; VI-NEXT: v_mov_b32_e32 v7, s15
806 ; VI-NEXT: v_mov_b32_e32 v8, s6
807 ; VI-NEXT: v_mov_b32_e32 v9, s7
808 ; VI-NEXT: s_mov_b32 m0, s4
809 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
810 ; VI-NEXT: s_mov_b32 s2, -1
811 ; VI-NEXT: v_movreld_b32_e32 v0, v10
812 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
813 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
814 ; VI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
816 %vecins = insertelement <10 x float> %a, float 5.000000e+00, i32 %b
817 store <10 x float> %vecins, ptr addrspace(1) %out, align 32
; Dynamic-index insert into an <11 x float>: the lane to overwrite with 5.0 is
; the runtime kernel argument %b; the result is stored to %out (align 32).
; The expected code loads the argument as a dwordx8 plus a dwordx4 (only three
; dwords of the latter are copied), moves the eleven elements into v[0:10],
; loads the index into m0, and inserts with an m0-relative move
; (v_movreld_b32 v0, v11; v11 holds 0x40a00000 = 5.0). The result is written
; as two dwordx4 stores plus a dwordx3 store at offset 32.
; On VI the second group of scalar loads is issued after the first wait and
; reuses s[8:11], so the instruction order differs from SI.
821 define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, <11 x float> %a, i32 %b) nounwind {
822 ; SI-LABEL: dynamic_insertelement_v11f32:
824 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
825 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
826 ; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18
827 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
828 ; SI-NEXT: v_mov_b32_e32 v11, 0x40a00000
829 ; SI-NEXT: s_mov_b32 s3, 0x100f000
830 ; SI-NEXT: s_waitcnt lgkmcnt(0)
831 ; SI-NEXT: v_mov_b32_e32 v0, s8
832 ; SI-NEXT: v_mov_b32_e32 v1, s9
833 ; SI-NEXT: v_mov_b32_e32 v2, s10
834 ; SI-NEXT: v_mov_b32_e32 v3, s11
835 ; SI-NEXT: v_mov_b32_e32 v4, s12
836 ; SI-NEXT: v_mov_b32_e32 v5, s13
837 ; SI-NEXT: v_mov_b32_e32 v6, s14
838 ; SI-NEXT: v_mov_b32_e32 v7, s15
839 ; SI-NEXT: v_mov_b32_e32 v8, s16
840 ; SI-NEXT: v_mov_b32_e32 v9, s17
841 ; SI-NEXT: v_mov_b32_e32 v10, s18
842 ; SI-NEXT: s_mov_b32 m0, s4
843 ; SI-NEXT: s_mov_b32 s2, -1
844 ; SI-NEXT: v_movreld_b32_e32 v0, v11
845 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
846 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
847 ; SI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
850 ; VI-LABEL: dynamic_insertelement_v11f32:
852 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
853 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
854 ; VI-NEXT: v_mov_b32_e32 v11, 0x40a00000
855 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
856 ; VI-NEXT: s_mov_b32 s2, -1
857 ; VI-NEXT: s_waitcnt lgkmcnt(0)
858 ; VI-NEXT: v_mov_b32_e32 v0, s8
859 ; VI-NEXT: v_mov_b32_e32 v1, s9
860 ; VI-NEXT: v_mov_b32_e32 v2, s10
861 ; VI-NEXT: v_mov_b32_e32 v3, s11
862 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60
863 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
864 ; VI-NEXT: v_mov_b32_e32 v4, s12
865 ; VI-NEXT: v_mov_b32_e32 v5, s13
866 ; VI-NEXT: v_mov_b32_e32 v6, s14
867 ; VI-NEXT: v_mov_b32_e32 v7, s15
868 ; VI-NEXT: s_waitcnt lgkmcnt(0)
869 ; VI-NEXT: v_mov_b32_e32 v8, s8
870 ; VI-NEXT: v_mov_b32_e32 v9, s9
871 ; VI-NEXT: v_mov_b32_e32 v10, s10
872 ; VI-NEXT: s_mov_b32 m0, s4
873 ; VI-NEXT: v_movreld_b32_e32 v0, v11
874 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
875 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
876 ; VI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
878 %vecins = insertelement <11 x float> %a, float 5.000000e+00, i32 %b
879 store <11 x float> %vecins, ptr addrspace(1) %out, align 32
; Inserts the constant 5.0 into the <12 x float> kernel argument %a at the
; runtime index %b and stores the whole vector to %out.
; The expected code for both targets copies the index into m0 and writes the
; element with v_movreld_b32 (0x40a00000 is the bit pattern of float 5.0),
; then stores the result as three dwordx4 pieces.
883 define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, <12 x float> %a, i32 %b) nounwind {
884 ; SI-LABEL: dynamic_insertelement_v12f32:
886 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
887 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
888 ; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18
889 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
890 ; SI-NEXT: v_mov_b32_e32 v12, 0x40a00000
891 ; SI-NEXT: s_mov_b32 s3, 0x100f000
892 ; SI-NEXT: s_waitcnt lgkmcnt(0)
893 ; SI-NEXT: v_mov_b32_e32 v0, s8
894 ; SI-NEXT: v_mov_b32_e32 v1, s9
895 ; SI-NEXT: v_mov_b32_e32 v2, s10
896 ; SI-NEXT: v_mov_b32_e32 v3, s11
897 ; SI-NEXT: v_mov_b32_e32 v4, s12
898 ; SI-NEXT: v_mov_b32_e32 v5, s13
899 ; SI-NEXT: v_mov_b32_e32 v6, s14
900 ; SI-NEXT: v_mov_b32_e32 v7, s15
901 ; SI-NEXT: v_mov_b32_e32 v8, s16
902 ; SI-NEXT: v_mov_b32_e32 v9, s17
903 ; SI-NEXT: v_mov_b32_e32 v10, s18
904 ; SI-NEXT: v_mov_b32_e32 v11, s19
905 ; SI-NEXT: s_mov_b32 m0, s4
906 ; SI-NEXT: s_mov_b32 s2, -1
907 ; SI-NEXT: v_movreld_b32_e32 v0, v12
908 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
909 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
910 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
913 ; VI-LABEL: dynamic_insertelement_v12f32:
915 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
916 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
917 ; VI-NEXT: v_mov_b32_e32 v12, 0x40a00000
918 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
919 ; VI-NEXT: s_mov_b32 s2, -1
920 ; VI-NEXT: s_waitcnt lgkmcnt(0)
921 ; VI-NEXT: v_mov_b32_e32 v0, s8
922 ; VI-NEXT: v_mov_b32_e32 v1, s9
923 ; VI-NEXT: v_mov_b32_e32 v2, s10
924 ; VI-NEXT: v_mov_b32_e32 v3, s11
925 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60
926 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
927 ; VI-NEXT: v_mov_b32_e32 v4, s12
928 ; VI-NEXT: v_mov_b32_e32 v5, s13
929 ; VI-NEXT: v_mov_b32_e32 v6, s14
930 ; VI-NEXT: v_mov_b32_e32 v7, s15
931 ; VI-NEXT: s_waitcnt lgkmcnt(0)
932 ; VI-NEXT: v_mov_b32_e32 v8, s8
933 ; VI-NEXT: v_mov_b32_e32 v9, s9
934 ; VI-NEXT: v_mov_b32_e32 v10, s10
935 ; VI-NEXT: v_mov_b32_e32 v11, s11
936 ; VI-NEXT: s_mov_b32 m0, s4
937 ; VI-NEXT: v_movreld_b32_e32 v0, v12
938 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
939 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
940 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; IR under test: variable-index insertelement followed by a full-vector store.
942 %vecins = insertelement <12 x float> %a, float 5.000000e+00, i32 %b
943 store <12 x float> %vecins, ptr addrspace(1) %out, align 32
; Inserts the constant 5.0 into the <16 x float> kernel argument %a at the
; runtime index %b and stores the whole vector to %out.
; The expected code for both targets loads the vector with one
; s_load_dwordx16, copies the index into m0, writes the element with
; v_movreld_b32 (0x40a00000 is the bit pattern of float 5.0), and stores the
; result as four dwordx4 pieces.
947 define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, <16 x float> %a, i32 %b) nounwind {
948 ; SI-LABEL: dynamic_insertelement_v16f32:
950 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
951 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
952 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
953 ; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000
954 ; SI-NEXT: s_mov_b32 s3, 0x100f000
955 ; SI-NEXT: s_mov_b32 s2, -1
956 ; SI-NEXT: s_waitcnt lgkmcnt(0)
957 ; SI-NEXT: v_mov_b32_e32 v0, s8
958 ; SI-NEXT: v_mov_b32_e32 v1, s9
959 ; SI-NEXT: v_mov_b32_e32 v2, s10
960 ; SI-NEXT: v_mov_b32_e32 v3, s11
961 ; SI-NEXT: v_mov_b32_e32 v4, s12
962 ; SI-NEXT: v_mov_b32_e32 v5, s13
963 ; SI-NEXT: v_mov_b32_e32 v6, s14
964 ; SI-NEXT: v_mov_b32_e32 v7, s15
965 ; SI-NEXT: v_mov_b32_e32 v8, s16
966 ; SI-NEXT: v_mov_b32_e32 v9, s17
967 ; SI-NEXT: v_mov_b32_e32 v10, s18
968 ; SI-NEXT: v_mov_b32_e32 v11, s19
969 ; SI-NEXT: v_mov_b32_e32 v12, s20
970 ; SI-NEXT: v_mov_b32_e32 v13, s21
971 ; SI-NEXT: v_mov_b32_e32 v14, s22
972 ; SI-NEXT: v_mov_b32_e32 v15, s23
973 ; SI-NEXT: s_mov_b32 m0, s4
974 ; SI-NEXT: v_movreld_b32_e32 v0, v16
975 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
976 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
977 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
978 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
981 ; VI-LABEL: dynamic_insertelement_v16f32:
983 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
984 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
985 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
986 ; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000
987 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
988 ; VI-NEXT: s_mov_b32 s2, -1
989 ; VI-NEXT: s_waitcnt lgkmcnt(0)
990 ; VI-NEXT: v_mov_b32_e32 v0, s8
991 ; VI-NEXT: v_mov_b32_e32 v1, s9
992 ; VI-NEXT: v_mov_b32_e32 v2, s10
993 ; VI-NEXT: v_mov_b32_e32 v3, s11
994 ; VI-NEXT: v_mov_b32_e32 v4, s12
995 ; VI-NEXT: v_mov_b32_e32 v5, s13
996 ; VI-NEXT: v_mov_b32_e32 v6, s14
997 ; VI-NEXT: v_mov_b32_e32 v7, s15
998 ; VI-NEXT: v_mov_b32_e32 v8, s16
999 ; VI-NEXT: v_mov_b32_e32 v9, s17
1000 ; VI-NEXT: v_mov_b32_e32 v10, s18
1001 ; VI-NEXT: v_mov_b32_e32 v11, s19
1002 ; VI-NEXT: v_mov_b32_e32 v12, s20
1003 ; VI-NEXT: v_mov_b32_e32 v13, s21
1004 ; VI-NEXT: v_mov_b32_e32 v14, s22
1005 ; VI-NEXT: v_mov_b32_e32 v15, s23
1006 ; VI-NEXT: s_mov_b32 m0, s4
1007 ; VI-NEXT: v_movreld_b32_e32 v0, v16
1008 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
1009 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1010 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1011 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; IR under test: variable-index insertelement followed by a full-vector store.
1013 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
1014 store <16 x float> %vecins, ptr addrspace(1) %out, align 64
; Inserts the constant 5 into the <2 x i32> kernel argument %a at the runtime
; index %b and stores the result to %out.
; The expected code for both targets does not use m0-relative addressing.
; Instead the index is compared against each lane number (s_cmp_lg_u32) and
; s_cselect_b32 keeps the original element or substitutes 5, before a single
; dwordx2 store.
1018 define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind {
1019 ; SI-LABEL: dynamic_insertelement_v2i32:
1021 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
1022 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1023 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1024 ; SI-NEXT: s_mov_b32 s6, -1
1025 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1026 ; SI-NEXT: s_cmp_lg_u32 s2, 1
1027 ; SI-NEXT: s_cselect_b32 s1, s1, 5
1028 ; SI-NEXT: s_cmp_lg_u32 s2, 0
1029 ; SI-NEXT: s_cselect_b32 s0, s0, 5
1030 ; SI-NEXT: v_mov_b32_e32 v0, s0
1031 ; SI-NEXT: v_mov_b32_e32 v1, s1
1032 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1035 ; VI-LABEL: dynamic_insertelement_v2i32:
1037 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
1038 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1039 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
1040 ; VI-NEXT: s_mov_b32 s6, -1
1041 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1042 ; VI-NEXT: s_cmp_lg_u32 s2, 1
1043 ; VI-NEXT: s_cselect_b32 s1, s1, 5
1044 ; VI-NEXT: s_cmp_lg_u32 s2, 0
1045 ; VI-NEXT: s_cselect_b32 s0, s0, 5
1046 ; VI-NEXT: v_mov_b32_e32 v0, s0
1047 ; VI-NEXT: v_mov_b32_e32 v1, s1
1048 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; IR under test: variable-index insertelement followed by a full-vector store.
1050 %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
1051 store <2 x i32> %vecins, ptr addrspace(1) %out, align 8
; Inserts the constant 5 into the <3 x i32> kernel argument %a at the runtime
; index %b and stores the result to %out.
; The expected code for both targets compares the index against each of the
; three lane numbers (s_cmp_lg_u32) and uses s_cselect_b32 to keep the original
; element or substitute 5, then issues a single dwordx3 store.
1055 define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 x i32> %a, i32 %b) nounwind {
1056 ; SI-LABEL: dynamic_insertelement_v3i32:
1058 ; SI-NEXT: s_load_dword s8, s[4:5], 0x8
1059 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
1060 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1061 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1062 ; SI-NEXT: s_mov_b32 s6, -1
1063 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1064 ; SI-NEXT: s_cmp_lg_u32 s8, 2
1065 ; SI-NEXT: s_cselect_b32 s2, s2, 5
1066 ; SI-NEXT: s_cmp_lg_u32 s8, 1
1067 ; SI-NEXT: s_cselect_b32 s1, s1, 5
1068 ; SI-NEXT: s_cmp_lg_u32 s8, 0
1069 ; SI-NEXT: s_cselect_b32 s0, s0, 5
1070 ; SI-NEXT: v_mov_b32_e32 v0, s0
1071 ; SI-NEXT: v_mov_b32_e32 v1, s1
1072 ; SI-NEXT: v_mov_b32_e32 v2, s2
1073 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
1076 ; VI-LABEL: dynamic_insertelement_v3i32:
1078 ; VI-NEXT: s_load_dword s8, s[4:5], 0x20
1079 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
1080 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1081 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
1082 ; VI-NEXT: s_mov_b32 s6, -1
1083 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1084 ; VI-NEXT: s_cmp_lg_u32 s8, 2
1085 ; VI-NEXT: s_cselect_b32 s2, s2, 5
1086 ; VI-NEXT: s_cmp_lg_u32 s8, 1
1087 ; VI-NEXT: s_cselect_b32 s1, s1, 5
1088 ; VI-NEXT: s_cmp_lg_u32 s8, 0
1089 ; VI-NEXT: s_cselect_b32 s0, s0, 5
1090 ; VI-NEXT: v_mov_b32_e32 v0, s0
1091 ; VI-NEXT: v_mov_b32_e32 v1, s1
1092 ; VI-NEXT: v_mov_b32_e32 v2, s2
1093 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; IR under test: variable-index insertelement followed by a full-vector store.
1095 %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
1096 store <3 x i32> %vecins, ptr addrspace(1) %out, align 16
; Inserts the kernel argument %val (not a constant) into the <4 x i32> kernel
; argument %a at the runtime index %b and stores the result to %out. The
; unnamed [8 x i32] argument sits between %b and %val in the signature.
; The expected code for both targets loads a separate dword into s9, compares
; the index against each lane number with s_cmp_eq_u32, and uses s_cselect_b32
; to pick s9 for the matching lane, then issues a single dwordx4 store.
1100 define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
1101 ; SI-LABEL: dynamic_insertelement_v4i32:
1103 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
1104 ; SI-NEXT: s_load_dword s8, s[4:5], 0x8
1105 ; SI-NEXT: s_load_dword s9, s[4:5], 0x11
1106 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1107 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1108 ; SI-NEXT: s_mov_b32 s6, -1
1109 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1110 ; SI-NEXT: s_cmp_eq_u32 s8, 3
1111 ; SI-NEXT: s_cselect_b32 s3, s9, s3
1112 ; SI-NEXT: s_cmp_eq_u32 s8, 2
1113 ; SI-NEXT: s_cselect_b32 s2, s9, s2
1114 ; SI-NEXT: s_cmp_eq_u32 s8, 1
1115 ; SI-NEXT: s_cselect_b32 s1, s9, s1
1116 ; SI-NEXT: s_cmp_eq_u32 s8, 0
1117 ; SI-NEXT: s_cselect_b32 s0, s9, s0
1118 ; SI-NEXT: v_mov_b32_e32 v0, s0
1119 ; SI-NEXT: v_mov_b32_e32 v1, s1
1120 ; SI-NEXT: v_mov_b32_e32 v2, s2
1121 ; SI-NEXT: v_mov_b32_e32 v3, s3
1122 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1125 ; VI-LABEL: dynamic_insertelement_v4i32:
1127 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
1128 ; VI-NEXT: s_load_dword s8, s[4:5], 0x20
1129 ; VI-NEXT: s_load_dword s9, s[4:5], 0x44
1130 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1131 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
1132 ; VI-NEXT: s_mov_b32 s6, -1
1133 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1134 ; VI-NEXT: s_cmp_eq_u32 s8, 3
1135 ; VI-NEXT: s_cselect_b32 s3, s9, s3
1136 ; VI-NEXT: s_cmp_eq_u32 s8, 2
1137 ; VI-NEXT: s_cselect_b32 s2, s9, s2
1138 ; VI-NEXT: s_cmp_eq_u32 s8, 1
1139 ; VI-NEXT: s_cselect_b32 s1, s9, s1
1140 ; VI-NEXT: s_cmp_eq_u32 s8, 0
1141 ; VI-NEXT: s_cselect_b32 s0, s9, s0
1142 ; VI-NEXT: v_mov_b32_e32 v0, s0
1143 ; VI-NEXT: v_mov_b32_e32 v1, s1
1144 ; VI-NEXT: v_mov_b32_e32 v2, s2
1145 ; VI-NEXT: v_mov_b32_e32 v3, s3
1146 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; IR under test: variable-index insertelement of a non-constant value followed
; by a full-vector store.
1148 %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
1149 store <4 x i32> %vecins, ptr addrspace(1) %out, align 16
; Inserts the constant 5 into the <8 x i32> kernel argument %a at the runtime
; index %b and stores the whole vector to %out.
; The expected code for both targets copies the index into m0 and writes the
; element with v_movreld_b32 using 5 as an immediate operand, then stores the
; result as two dwordx4 pieces.
1153 define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 x i32> %a, i32 %b) nounwind {
1154 ; SI-LABEL: dynamic_insertelement_v8i32:
1156 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
1157 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1158 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10
1159 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1160 ; SI-NEXT: s_mov_b32 s2, -1
1161 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1162 ; SI-NEXT: v_mov_b32_e32 v0, s8
1163 ; SI-NEXT: v_mov_b32_e32 v1, s9
1164 ; SI-NEXT: v_mov_b32_e32 v2, s10
1165 ; SI-NEXT: v_mov_b32_e32 v3, s11
1166 ; SI-NEXT: v_mov_b32_e32 v4, s12
1167 ; SI-NEXT: v_mov_b32_e32 v5, s13
1168 ; SI-NEXT: v_mov_b32_e32 v6, s14
1169 ; SI-NEXT: v_mov_b32_e32 v7, s15
1170 ; SI-NEXT: s_mov_b32 m0, s4
1171 ; SI-NEXT: v_movreld_b32_e32 v0, 5
1172 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1173 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1176 ; VI-LABEL: dynamic_insertelement_v8i32:
1178 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
1179 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1180 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40
1181 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1182 ; VI-NEXT: s_mov_b32 s2, -1
1183 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1184 ; VI-NEXT: v_mov_b32_e32 v0, s8
1185 ; VI-NEXT: v_mov_b32_e32 v1, s9
1186 ; VI-NEXT: v_mov_b32_e32 v2, s10
1187 ; VI-NEXT: v_mov_b32_e32 v3, s11
1188 ; VI-NEXT: v_mov_b32_e32 v4, s12
1189 ; VI-NEXT: v_mov_b32_e32 v5, s13
1190 ; VI-NEXT: v_mov_b32_e32 v6, s14
1191 ; VI-NEXT: v_mov_b32_e32 v7, s15
1192 ; VI-NEXT: s_mov_b32 m0, s4
1193 ; VI-NEXT: v_movreld_b32_e32 v0, 5
1194 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1195 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; IR under test: variable-index insertelement followed by a full-vector store.
1197 %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
1198 store <8 x i32> %vecins, ptr addrspace(1) %out, align 32
; Inserts the constant 5 into the <9 x i32> kernel argument %a at the runtime
; index %b and stores the whole vector to %out.
; The expected code for both targets copies the index into m0 and writes the
; element with v_movreld_b32 using 5 as an immediate operand, then stores the
; result as two dwordx4 pieces plus a single dword tail at offset 32.
1202 define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 x i32> %a, i32 %b) nounwind {
1203 ; SI-LABEL: dynamic_insertelement_v9i32:
1205 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1206 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
1207 ; SI-NEXT: s_load_dword s6, s[4:5], 0x18
1208 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
1209 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1210 ; SI-NEXT: s_mov_b32 s2, -1
1211 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1212 ; SI-NEXT: v_mov_b32_e32 v0, s8
1213 ; SI-NEXT: v_mov_b32_e32 v1, s9
1214 ; SI-NEXT: v_mov_b32_e32 v2, s10
1215 ; SI-NEXT: v_mov_b32_e32 v3, s11
1216 ; SI-NEXT: v_mov_b32_e32 v4, s12
1217 ; SI-NEXT: v_mov_b32_e32 v5, s13
1218 ; SI-NEXT: v_mov_b32_e32 v6, s14
1219 ; SI-NEXT: v_mov_b32_e32 v7, s15
1220 ; SI-NEXT: v_mov_b32_e32 v8, s6
1221 ; SI-NEXT: s_mov_b32 m0, s4
1222 ; SI-NEXT: v_movreld_b32_e32 v0, 5
1223 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
1224 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1225 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1228 ; VI-LABEL: dynamic_insertelement_v9i32:
1230 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
1231 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1232 ; VI-NEXT: s_load_dword s6, s[4:5], 0x60
1233 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
1234 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1235 ; VI-NEXT: s_mov_b32 s2, -1
1236 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1237 ; VI-NEXT: v_mov_b32_e32 v0, s8
1238 ; VI-NEXT: v_mov_b32_e32 v1, s9
1239 ; VI-NEXT: v_mov_b32_e32 v2, s10
1240 ; VI-NEXT: v_mov_b32_e32 v3, s11
1241 ; VI-NEXT: v_mov_b32_e32 v4, s12
1242 ; VI-NEXT: v_mov_b32_e32 v5, s13
1243 ; VI-NEXT: v_mov_b32_e32 v6, s14
1244 ; VI-NEXT: v_mov_b32_e32 v7, s15
1245 ; VI-NEXT: v_mov_b32_e32 v8, s6
1246 ; VI-NEXT: s_mov_b32 m0, s4
1247 ; VI-NEXT: v_movreld_b32_e32 v0, 5
1248 ; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
1249 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1250 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; IR under test: variable-index insertelement followed by a full-vector store.
1252 %vecins = insertelement <9 x i32> %a, i32 5, i32 %b
1253 store <9 x i32> %vecins, ptr addrspace(1) %out, align 32
; Inserts the constant 5 into the <10 x i32> kernel argument %a at the runtime
; index %b and stores the whole vector to %out.
; The expected code for both targets copies the index into m0 and writes the
; element with v_movreld_b32 using 5 as an immediate operand, then stores the
; result as two dwordx4 pieces plus a dwordx2 tail at offset 32.
1257 define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, <10 x i32> %a, i32 %b) nounwind {
1258 ; SI-LABEL: dynamic_insertelement_v10i32:
1260 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1261 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
1262 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18
1263 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
1264 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1265 ; SI-NEXT: s_mov_b32 s2, -1
1266 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1267 ; SI-NEXT: v_mov_b32_e32 v0, s8
1268 ; SI-NEXT: v_mov_b32_e32 v1, s9
1269 ; SI-NEXT: v_mov_b32_e32 v2, s10
1270 ; SI-NEXT: v_mov_b32_e32 v3, s11
1271 ; SI-NEXT: v_mov_b32_e32 v4, s12
1272 ; SI-NEXT: v_mov_b32_e32 v5, s13
1273 ; SI-NEXT: v_mov_b32_e32 v6, s14
1274 ; SI-NEXT: v_mov_b32_e32 v7, s15
1275 ; SI-NEXT: v_mov_b32_e32 v8, s6
1276 ; SI-NEXT: v_mov_b32_e32 v9, s7
1277 ; SI-NEXT: s_mov_b32 m0, s4
1278 ; SI-NEXT: v_movreld_b32_e32 v0, 5
1279 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1280 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1281 ; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
1284 ; VI-LABEL: dynamic_insertelement_v10i32:
1286 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
1287 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1288 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x60
1289 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
1290 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1291 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1292 ; VI-NEXT: v_mov_b32_e32 v0, s8
1293 ; VI-NEXT: v_mov_b32_e32 v1, s9
1294 ; VI-NEXT: v_mov_b32_e32 v2, s10
1295 ; VI-NEXT: v_mov_b32_e32 v3, s11
1296 ; VI-NEXT: v_mov_b32_e32 v4, s12
1297 ; VI-NEXT: v_mov_b32_e32 v5, s13
1298 ; VI-NEXT: v_mov_b32_e32 v6, s14
1299 ; VI-NEXT: v_mov_b32_e32 v7, s15
1300 ; VI-NEXT: v_mov_b32_e32 v8, s6
1301 ; VI-NEXT: v_mov_b32_e32 v9, s7
1302 ; VI-NEXT: s_mov_b32 m0, s4
1303 ; VI-NEXT: s_mov_b32 s2, -1
1304 ; VI-NEXT: v_movreld_b32_e32 v0, 5
1305 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1306 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1307 ; VI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
; IR under test: variable-index insertelement followed by a full-vector store.
1309 %vecins = insertelement <10 x i32> %a, i32 5, i32 %b
1310 store <10 x i32> %vecins, ptr addrspace(1) %out, align 32
; Inserts the constant 5 into the <11 x i32> kernel argument %a at the runtime
; index %b and stores the whole vector to %out.
; The expected code for both targets copies the index into m0 and writes the
; element with v_movreld_b32 using 5 as an immediate operand, then stores the
; result as two dwordx4 pieces plus a dwordx3 tail at offset 32.
1314 define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, <11 x i32> %a, i32 %b) nounwind {
1315 ; SI-LABEL: dynamic_insertelement_v11i32:
1317 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1318 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
1319 ; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18
1320 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
1321 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1322 ; SI-NEXT: s_mov_b32 s2, -1
1323 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1324 ; SI-NEXT: v_mov_b32_e32 v0, s8
1325 ; SI-NEXT: v_mov_b32_e32 v1, s9
1326 ; SI-NEXT: v_mov_b32_e32 v2, s10
1327 ; SI-NEXT: v_mov_b32_e32 v3, s11
1328 ; SI-NEXT: v_mov_b32_e32 v4, s12
1329 ; SI-NEXT: v_mov_b32_e32 v5, s13
1330 ; SI-NEXT: v_mov_b32_e32 v6, s14
1331 ; SI-NEXT: v_mov_b32_e32 v7, s15
1332 ; SI-NEXT: v_mov_b32_e32 v8, s16
1333 ; SI-NEXT: v_mov_b32_e32 v9, s17
1334 ; SI-NEXT: v_mov_b32_e32 v10, s18
1335 ; SI-NEXT: s_mov_b32 m0, s4
1336 ; SI-NEXT: v_movreld_b32_e32 v0, 5
1337 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1338 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1339 ; SI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
1342 ; VI-LABEL: dynamic_insertelement_v11i32:
1344 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
1345 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1346 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1347 ; VI-NEXT: s_mov_b32 s2, -1
1348 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1349 ; VI-NEXT: v_mov_b32_e32 v0, s8
1350 ; VI-NEXT: v_mov_b32_e32 v1, s9
1351 ; VI-NEXT: v_mov_b32_e32 v2, s10
1352 ; VI-NEXT: v_mov_b32_e32 v3, s11
1353 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60
1354 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
1355 ; VI-NEXT: v_mov_b32_e32 v4, s12
1356 ; VI-NEXT: v_mov_b32_e32 v5, s13
1357 ; VI-NEXT: v_mov_b32_e32 v6, s14
1358 ; VI-NEXT: v_mov_b32_e32 v7, s15
1359 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1360 ; VI-NEXT: v_mov_b32_e32 v8, s8
1361 ; VI-NEXT: v_mov_b32_e32 v9, s9
1362 ; VI-NEXT: v_mov_b32_e32 v10, s10
1363 ; VI-NEXT: s_mov_b32 m0, s4
1364 ; VI-NEXT: v_movreld_b32_e32 v0, 5
1365 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1366 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1367 ; VI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
; IR under test: variable-index insertelement followed by a full-vector store.
1369 %vecins = insertelement <11 x i32> %a, i32 5, i32 %b
1370 store <11 x i32> %vecins, ptr addrspace(1) %out, align 32
; Inserts the constant 5 into the <12 x i32> kernel argument %a at the runtime
; index %b and stores the whole vector to %out.
; The expected code for both targets copies the index into m0 and writes the
; element with v_movreld_b32 using 5 as an immediate operand, then stores the
; result as three dwordx4 pieces.
1374 define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, <12 x i32> %a, i32 %b) nounwind {
1375 ; SI-LABEL: dynamic_insertelement_v12i32:
1377 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1378 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
1379 ; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x18
1380 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
1381 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1382 ; SI-NEXT: s_mov_b32 s2, -1
1383 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1384 ; SI-NEXT: v_mov_b32_e32 v0, s8
1385 ; SI-NEXT: v_mov_b32_e32 v1, s9
1386 ; SI-NEXT: v_mov_b32_e32 v2, s10
1387 ; SI-NEXT: v_mov_b32_e32 v3, s11
1388 ; SI-NEXT: v_mov_b32_e32 v4, s12
1389 ; SI-NEXT: v_mov_b32_e32 v5, s13
1390 ; SI-NEXT: v_mov_b32_e32 v6, s14
1391 ; SI-NEXT: v_mov_b32_e32 v7, s15
1392 ; SI-NEXT: v_mov_b32_e32 v8, s16
1393 ; SI-NEXT: v_mov_b32_e32 v9, s17
1394 ; SI-NEXT: v_mov_b32_e32 v10, s18
1395 ; SI-NEXT: v_mov_b32_e32 v11, s19
1396 ; SI-NEXT: s_mov_b32 m0, s4
1397 ; SI-NEXT: v_movreld_b32_e32 v0, 5
1398 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1399 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1400 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1403 ; VI-LABEL: dynamic_insertelement_v12i32:
1405 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
1406 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1407 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1408 ; VI-NEXT: s_mov_b32 s2, -1
1409 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1410 ; VI-NEXT: v_mov_b32_e32 v0, s8
1411 ; VI-NEXT: v_mov_b32_e32 v1, s9
1412 ; VI-NEXT: v_mov_b32_e32 v2, s10
1413 ; VI-NEXT: v_mov_b32_e32 v3, s11
1414 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x60
1415 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
1416 ; VI-NEXT: v_mov_b32_e32 v4, s12
1417 ; VI-NEXT: v_mov_b32_e32 v5, s13
1418 ; VI-NEXT: v_mov_b32_e32 v6, s14
1419 ; VI-NEXT: v_mov_b32_e32 v7, s15
1420 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1421 ; VI-NEXT: v_mov_b32_e32 v8, s8
1422 ; VI-NEXT: v_mov_b32_e32 v9, s9
1423 ; VI-NEXT: v_mov_b32_e32 v10, s10
1424 ; VI-NEXT: v_mov_b32_e32 v11, s11
1425 ; VI-NEXT: s_mov_b32 m0, s4
1426 ; VI-NEXT: v_movreld_b32_e32 v0, 5
1427 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1428 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1429 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; IR under test: variable-index insertelement followed by a full-vector store.
1431 %vecins = insertelement <12 x i32> %a, i32 5, i32 %b
1432 store <12 x i32> %vecins, ptr addrspace(1) %out, align 32
; Inserts the constant 5 into the <16 x i32> kernel argument %a at the runtime
; index %b and stores the whole vector to %out.
; The expected code for both targets loads the vector with one
; s_load_dwordx16, copies the index into m0, writes the element with
; v_movreld_b32 using 5 as an immediate operand, and stores the result as four
; dwordx4 pieces.
1436 define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, <16 x i32> %a, i32 %b) nounwind {
1437 ; SI-LABEL: dynamic_insertelement_v16i32:
1439 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
1440 ; SI-NEXT: s_load_dword s6, s[4:5], 0x20
1441 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1442 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1443 ; SI-NEXT: s_mov_b32 s2, -1
1444 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1445 ; SI-NEXT: v_mov_b32_e32 v0, s8
1446 ; SI-NEXT: v_mov_b32_e32 v1, s9
1447 ; SI-NEXT: v_mov_b32_e32 v2, s10
1448 ; SI-NEXT: v_mov_b32_e32 v3, s11
1449 ; SI-NEXT: v_mov_b32_e32 v4, s12
1450 ; SI-NEXT: v_mov_b32_e32 v5, s13
1451 ; SI-NEXT: v_mov_b32_e32 v6, s14
1452 ; SI-NEXT: v_mov_b32_e32 v7, s15
1453 ; SI-NEXT: v_mov_b32_e32 v8, s16
1454 ; SI-NEXT: v_mov_b32_e32 v9, s17
1455 ; SI-NEXT: v_mov_b32_e32 v10, s18
1456 ; SI-NEXT: v_mov_b32_e32 v11, s19
1457 ; SI-NEXT: v_mov_b32_e32 v12, s20
1458 ; SI-NEXT: v_mov_b32_e32 v13, s21
1459 ; SI-NEXT: v_mov_b32_e32 v14, s22
1460 ; SI-NEXT: v_mov_b32_e32 v15, s23
1461 ; SI-NEXT: s_mov_b32 m0, s6
1462 ; SI-NEXT: v_movreld_b32_e32 v0, 5
1463 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
1464 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1465 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1466 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1469 ; VI-LABEL: dynamic_insertelement_v16i32:
1471 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
1472 ; VI-NEXT: s_load_dword s6, s[4:5], 0x80
1473 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1474 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1475 ; VI-NEXT: s_mov_b32 s2, -1
1476 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1477 ; VI-NEXT: v_mov_b32_e32 v0, s8
1478 ; VI-NEXT: v_mov_b32_e32 v1, s9
1479 ; VI-NEXT: v_mov_b32_e32 v2, s10
1480 ; VI-NEXT: v_mov_b32_e32 v3, s11
1481 ; VI-NEXT: v_mov_b32_e32 v4, s12
1482 ; VI-NEXT: v_mov_b32_e32 v5, s13
1483 ; VI-NEXT: v_mov_b32_e32 v6, s14
1484 ; VI-NEXT: v_mov_b32_e32 v7, s15
1485 ; VI-NEXT: v_mov_b32_e32 v8, s16
1486 ; VI-NEXT: v_mov_b32_e32 v9, s17
1487 ; VI-NEXT: v_mov_b32_e32 v10, s18
1488 ; VI-NEXT: v_mov_b32_e32 v11, s19
1489 ; VI-NEXT: v_mov_b32_e32 v12, s20
1490 ; VI-NEXT: v_mov_b32_e32 v13, s21
1491 ; VI-NEXT: v_mov_b32_e32 v14, s22
1492 ; VI-NEXT: v_mov_b32_e32 v15, s23
1493 ; VI-NEXT: s_mov_b32 m0, s6
1494 ; VI-NEXT: v_movreld_b32_e32 v0, 5
1495 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
1496 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1497 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1498 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; IR under test: variable-index insertelement followed by a full-vector store.
1500 %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
1501 store <16 x i32> %vecins, ptr addrspace(1) %out, align 64
; Inserts the constant 5 into the <2 x i16> kernel argument %a at the runtime
; index %b and stores the result to %out.
; The expected code for both targets treats the vector as one 32-bit scalar.
; The index is shifted left by 4 (index * 16 bits), a 0xffff mask is shifted
; to that position, the mask clears the lane in the source (s_andn2_b32) and
; selects the lane from 0x50005 (s_and_b32), and the two halves are OR-ed
; before a single dword store.
1505 define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 x i16> %a, i32 %b) nounwind {
1506 ; SI-LABEL: dynamic_insertelement_v2i16:
1508 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1509 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1510 ; SI-NEXT: s_mov_b32 s6, -1
1511 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1512 ; SI-NEXT: s_mov_b32 s4, s0
1513 ; SI-NEXT: s_lshl_b32 s0, s3, 4
1514 ; SI-NEXT: s_lshl_b32 s0, 0xffff, s0
1515 ; SI-NEXT: s_mov_b32 s5, s1
1516 ; SI-NEXT: s_andn2_b32 s1, s2, s0
1517 ; SI-NEXT: s_and_b32 s0, s0, 0x50005
1518 ; SI-NEXT: s_or_b32 s0, s0, s1
1519 ; SI-NEXT: v_mov_b32_e32 v0, s0
1520 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1523 ; VI-LABEL: dynamic_insertelement_v2i16:
1525 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1526 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
1527 ; VI-NEXT: s_mov_b32 s6, -1
1528 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1529 ; VI-NEXT: s_mov_b32 s4, s0
1530 ; VI-NEXT: s_lshl_b32 s0, s3, 4
1531 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
1532 ; VI-NEXT: s_mov_b32 s5, s1
1533 ; VI-NEXT: s_andn2_b32 s1, s2, s0
1534 ; VI-NEXT: s_and_b32 s0, s0, 0x50005
1535 ; VI-NEXT: s_or_b32 s0, s0, s1
1536 ; VI-NEXT: v_mov_b32_e32 v0, s0
1537 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; IR under test: variable-index insertelement followed by a full-vector store.
1539 %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
1540 store <2 x i16> %vecins, ptr addrspace(1) %out, align 8
; Inserts the constant 5 into the <3 x i16> kernel argument %a at the runtime
; index %b and stores the result to %out.
; The expected code for both targets works on a 64-bit scalar pair. The index
; is shifted left by 4 (index * 16 bits) and a 0xffff mask is positioned with
; s_lshl_b64, then used to blend the source with a 0x50005 pattern.
; The first target ANDs each 32-bit half with 0x50005 separately, while the
; second builds a 64-bit 0x50005 pair in s[8:9] and uses s_and_b64.
; Both store the result as one dword plus one short at offset 4.
1544 define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 x i16> %a, i32 %b) nounwind {
1545 ; SI-LABEL: dynamic_insertelement_v3i16:
1547 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1548 ; SI-NEXT: s_load_dword s8, s[4:5], 0x4
1549 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1550 ; SI-NEXT: s_mov_b32 s6, -1
1551 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1552 ; SI-NEXT: s_mov_b32 s4, s0
1553 ; SI-NEXT: s_lshl_b32 s0, s8, 4
1554 ; SI-NEXT: s_mov_b32 s5, s1
1555 ; SI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
1556 ; SI-NEXT: s_and_b32 s9, s1, 0x50005
1557 ; SI-NEXT: s_and_b32 s8, s0, 0x50005
1558 ; SI-NEXT: s_andn2_b64 s[0:1], s[2:3], s[0:1]
1559 ; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
1560 ; SI-NEXT: v_mov_b32_e32 v0, s1
1561 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
1562 ; SI-NEXT: v_mov_b32_e32 v0, s0
1563 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1566 ; VI-LABEL: dynamic_insertelement_v3i16:
1568 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1569 ; VI-NEXT: s_load_dword s8, s[4:5], 0x10
1570 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
1571 ; VI-NEXT: s_mov_b32 s6, -1
1572 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1573 ; VI-NEXT: s_mov_b32 s4, s0
1574 ; VI-NEXT: s_lshl_b32 s0, s8, 4
1575 ; VI-NEXT: s_mov_b32 s8, 0x50005
1576 ; VI-NEXT: s_mov_b32 s5, s1
1577 ; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
1578 ; VI-NEXT: s_mov_b32 s9, s8
1579 ; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
1580 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
1581 ; VI-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
1582 ; VI-NEXT: v_mov_b32_e32 v0, s1
1583 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
1584 ; VI-NEXT: v_mov_b32_e32 v0, s0
1585 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; IR under test: variable-index insertelement followed by a full-vector store.
1587 %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
1588 store <3 x i16> %vecins, ptr addrspace(1) %out, align 8
; Inserts the constant 5 into the <2 x i8> kernel argument %a at the runtime
; index %b and stores the result to %out. Unnamed [8 x i32] arguments separate
; %out, %a and %b in the signature.
; Both expected sequences shift the index left by 3 (index * 8 bits), position
; a 0xff mask, and blend the source with a 0x505 pattern before a short store.
; The first target does the blend entirely with scalar instructions, while the
; second does it with vector instructions (v_lshlrev_b16, v_not_b32,
; v_and_b32, v_or_b32).
1592 define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
1593 ; SI-LABEL: dynamic_insertelement_v2i8:
1595 ; SI-NEXT: s_load_dword s6, s[4:5], 0x13
1596 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1597 ; SI-NEXT: s_load_dword s4, s[4:5], 0xa
1598 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1599 ; SI-NEXT: s_mov_b32 s2, -1
1600 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1601 ; SI-NEXT: s_lshl_b32 s5, s6, 3
1602 ; SI-NEXT: s_lshl_b32 s5, 0xff, s5
1603 ; SI-NEXT: s_andn2_b32 s4, s4, s5
1604 ; SI-NEXT: s_and_b32 s5, s5, 0x505
1605 ; SI-NEXT: s_or_b32 s4, s5, s4
1606 ; SI-NEXT: v_mov_b32_e32 v0, s4
1607 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1610 ; VI-LABEL: dynamic_insertelement_v2i8:
1612 ; VI-NEXT: s_load_dword s6, s[4:5], 0x4c
1613 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1614 ; VI-NEXT: s_load_dword s4, s[4:5], 0x28
1615 ; VI-NEXT: v_mov_b32_e32 v0, 0xff
1616 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1617 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1618 ; VI-NEXT: s_lshl_b32 s5, s6, 3
1619 ; VI-NEXT: v_lshlrev_b16_e32 v0, s5, v0
1620 ; VI-NEXT: v_not_b32_e32 v1, v0
1621 ; VI-NEXT: v_and_b32_e32 v1, s4, v1
1622 ; VI-NEXT: v_and_b32_e32 v0, 0x505, v0
1623 ; VI-NEXT: s_mov_b32 s2, -1
1624 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
1625 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; IR under test: variable-index insertelement followed by a full-vector store.
1627 %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
1628 store <2 x i8> %vecins, ptr addrspace(1) %out, align 8
1632 ; FIXME: After legalization, the i16 and i32 shifts aren't merged because of
1633 ; isTypeDesirableForOp in SimplifyDemandedBits.
; Inserts the constant 5 into the <3 x i8> kernel argument %a at the runtime
; index %b and stores the result to %out. Unnamed [8 x i32] arguments separate
; %out, %a and %b in the signature.
; The expected code for both targets shifts the index left by 3 (index * 8
; bits), positions a 0xff mask, and blends the source with a 0x5050505 pattern
; using scalar instructions. The result is written as a short at offset 0 plus
; a byte at offset 2, where the byte comes from the value shifted right by 16.
1634 define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
1635 ; SI-LABEL: dynamic_insertelement_v3i8:
1637 ; SI-NEXT: s_load_dword s6, s[4:5], 0x13
1638 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1639 ; SI-NEXT: s_load_dword s4, s[4:5], 0xa
1640 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1641 ; SI-NEXT: s_mov_b32 s2, -1
1642 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1643 ; SI-NEXT: s_lshl_b32 s5, s6, 3
1644 ; SI-NEXT: s_lshl_b32 s5, 0xff, s5
1645 ; SI-NEXT: s_andn2_b32 s4, s4, s5
1646 ; SI-NEXT: s_and_b32 s5, s5, 0x5050505
1647 ; SI-NEXT: s_or_b32 s4, s5, s4
1648 ; SI-NEXT: s_lshr_b32 s5, s4, 16
1649 ; SI-NEXT: v_mov_b32_e32 v0, s4
1650 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1651 ; SI-NEXT: v_mov_b32_e32 v0, s5
1652 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
1655 ; VI-LABEL: dynamic_insertelement_v3i8:
1657 ; VI-NEXT: s_load_dword s6, s[4:5], 0x4c
1658 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1659 ; VI-NEXT: s_load_dword s4, s[4:5], 0x28
1660 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1661 ; VI-NEXT: s_mov_b32 s2, -1
1662 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1663 ; VI-NEXT: s_lshl_b32 s5, s6, 3
1664 ; VI-NEXT: s_lshl_b32 s5, 0xff, s5
1665 ; VI-NEXT: s_andn2_b32 s4, s4, s5
1666 ; VI-NEXT: s_and_b32 s5, s5, 0x5050505
1667 ; VI-NEXT: s_or_b32 s4, s5, s4
1668 ; VI-NEXT: s_lshr_b32 s5, s4, 16
1669 ; VI-NEXT: v_mov_b32_e32 v0, s4
1670 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
1671 ; VI-NEXT: v_mov_b32_e32 v0, s5
1672 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
; IR under test: variable-index insertelement followed by a full-vector store.
1674 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
1675 store <3 x i8> %vecins, ptr addrspace(1) %out, align 4
; Inserts the constant 5 into the <4 x i8> kernel argument %a at the runtime
; index %b and stores the result to %out. Unnamed [8 x i32] arguments separate
; %out, %a and %b in the signature.
; The expected code for both targets shifts the index left by 3 (index * 8
; bits), positions a 0xff mask, and blends the source with a 0x5050505 pattern
; using scalar instructions, then issues a single dword store.
1679 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
1680 ; SI-LABEL: dynamic_insertelement_v4i8:
1682 ; SI-NEXT: s_load_dword s6, s[4:5], 0x13
1683 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1684 ; SI-NEXT: s_load_dword s4, s[4:5], 0xa
1685 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1686 ; SI-NEXT: s_mov_b32 s2, -1
1687 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1688 ; SI-NEXT: s_lshl_b32 s5, s6, 3
1689 ; SI-NEXT: s_lshl_b32 s5, 0xff, s5
1690 ; SI-NEXT: s_andn2_b32 s4, s4, s5
1691 ; SI-NEXT: s_and_b32 s5, s5, 0x5050505
1692 ; SI-NEXT: s_or_b32 s4, s5, s4
1693 ; SI-NEXT: v_mov_b32_e32 v0, s4
1694 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1697 ; VI-LABEL: dynamic_insertelement_v4i8:
1699 ; VI-NEXT: s_load_dword s6, s[4:5], 0x4c
1700 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1701 ; VI-NEXT: s_load_dword s4, s[4:5], 0x28
1702 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1703 ; VI-NEXT: s_mov_b32 s2, -1
1704 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1705 ; VI-NEXT: s_lshl_b32 s5, s6, 3
1706 ; VI-NEXT: s_lshl_b32 s5, 0xff, s5
1707 ; VI-NEXT: s_andn2_b32 s4, s4, s5
1708 ; VI-NEXT: s_and_b32 s5, s5, 0x5050505
1709 ; VI-NEXT: s_or_b32 s4, s5, s4
1710 ; VI-NEXT: v_mov_b32_e32 v0, s4
1711 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; IR under test: variable-index insertelement followed by a full-vector store.
1713 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
1714 store <4 x i8> %vecins, ptr addrspace(1) %out, align 4
; Load a <8 x i8> vector through the addrspace(4) pointer %a.ptr, insert the
; constant i8 5 at the runtime index %b, and store the result to %out. The
; expected code is the 64-bit form of the byte-mask sequence: s_lshl_b64 shifts
; 0xff left by 8 * %b, s_andn2_b64 clears the selected byte in the loaded
; value, each half of the mask is ANDed with 0x5050505, and s_or_b64 merges
; the two before a single dwordx2 store.
1718 define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %a.ptr, i32 %b) nounwind {
1719 ; SI-LABEL: s_dynamic_insertelement_v8i8:
1721 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1722 ; SI-NEXT: s_load_dword s8, s[4:5], 0x4
1723 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1724 ; SI-NEXT: s_mov_b32 s6, -1
1725 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1726 ; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1727 ; SI-NEXT: s_mov_b32 s4, s0
1728 ; SI-NEXT: s_lshl_b32 s0, s8, 3
1729 ; SI-NEXT: s_mov_b32 s5, s1
1730 ; SI-NEXT: s_lshl_b64 s[0:1], 0xff, s0
1731 ; SI-NEXT: s_and_b32 s9, s1, 0x5050505
1732 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1733 ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
1734 ; SI-NEXT: s_and_b32 s8, s0, 0x5050505
1735 ; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3]
1736 ; SI-NEXT: v_mov_b32_e32 v0, s0
1737 ; SI-NEXT: v_mov_b32_e32 v1, s1
1738 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1741 ; VI-LABEL: s_dynamic_insertelement_v8i8:
1743 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1744 ; VI-NEXT: s_load_dword s8, s[4:5], 0x10
1745 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
1746 ; VI-NEXT: s_mov_b32 s6, -1
1747 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1748 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1749 ; VI-NEXT: s_mov_b32 s4, s0
1750 ; VI-NEXT: s_lshl_b32 s0, s8, 3
1751 ; VI-NEXT: s_mov_b32 s5, s1
1752 ; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0
1753 ; VI-NEXT: s_and_b32 s9, s1, 0x5050505
1754 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1755 ; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
1756 ; VI-NEXT: s_and_b32 s8, s0, 0x5050505
1757 ; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3]
1758 ; VI-NEXT: v_mov_b32_e32 v0, s0
1759 ; VI-NEXT: v_mov_b32_e32 v1, s1
1760 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1762 %a = load <8 x i8>, ptr addrspace(4) %a.ptr, align 4
1763 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
1764 store <8 x i8> %vecins, ptr addrspace(1) %out, align 8
; Insert the constant i8 5 into <16 x i8> %a at the runtime index %b and store
; the result to %out. Unlike the 4- and 8-element byte tests, the expected
; code does not build a shifted byte mask. It compares the index against each
; lane number from 15 down to 0 (s_cmp_lg_u32), picks either the original byte
; or 5 per lane, and repacks the bytes into four dwords for one dwordx4 store.
; The SI checks do the selects with s_cselect_b32 and scalar shifts/ORs; the
; VI checks use v_cndmask_b32 with v_lshlrev_b16 and SDWA ORs.
1768 define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <16 x i8> %a, i32 %b) nounwind {
1769 ; SI-LABEL: dynamic_insertelement_v16i8:
1771 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
1772 ; SI-NEXT: s_load_dword s6, s[4:5], 0x8
1773 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1774 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1775 ; SI-NEXT: s_mov_b32 s2, -1
1776 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1777 ; SI-NEXT: s_lshr_b32 s4, s11, 24
1778 ; SI-NEXT: s_cmp_lg_u32 s6, 15
1779 ; SI-NEXT: s_cselect_b32 s4, s4, 5
1780 ; SI-NEXT: s_lshl_b32 s4, s4, 24
1781 ; SI-NEXT: s_lshr_b32 s5, s11, 16
1782 ; SI-NEXT: s_cmp_lg_u32 s6, 14
1783 ; SI-NEXT: s_cselect_b32 s5, s5, 5
1784 ; SI-NEXT: s_and_b32 s5, s5, 0xff
1785 ; SI-NEXT: s_lshl_b32 s5, s5, 16
1786 ; SI-NEXT: s_or_b32 s4, s4, s5
1787 ; SI-NEXT: s_lshr_b32 s5, s11, 8
1788 ; SI-NEXT: s_cmp_lg_u32 s6, 13
1789 ; SI-NEXT: s_cselect_b32 s5, s5, 5
1790 ; SI-NEXT: s_lshl_b32 s5, s5, 8
1791 ; SI-NEXT: s_cmp_lg_u32 s6, 12
1792 ; SI-NEXT: s_cselect_b32 s7, s11, 5
1793 ; SI-NEXT: s_and_b32 s7, s7, 0xff
1794 ; SI-NEXT: s_or_b32 s5, s7, s5
1795 ; SI-NEXT: s_and_b32 s5, s5, 0xffff
1796 ; SI-NEXT: s_or_b32 s4, s5, s4
1797 ; SI-NEXT: s_lshr_b32 s5, s10, 24
1798 ; SI-NEXT: s_cmp_lg_u32 s6, 11
1799 ; SI-NEXT: s_cselect_b32 s5, s5, 5
1800 ; SI-NEXT: s_lshl_b32 s5, s5, 24
1801 ; SI-NEXT: s_lshr_b32 s7, s10, 16
1802 ; SI-NEXT: s_cmp_lg_u32 s6, 10
1803 ; SI-NEXT: s_cselect_b32 s7, s7, 5
1804 ; SI-NEXT: s_and_b32 s7, s7, 0xff
1805 ; SI-NEXT: s_lshl_b32 s7, s7, 16
1806 ; SI-NEXT: s_or_b32 s5, s5, s7
1807 ; SI-NEXT: s_lshr_b32 s7, s10, 8
1808 ; SI-NEXT: s_cmp_lg_u32 s6, 9
1809 ; SI-NEXT: s_cselect_b32 s7, s7, 5
1810 ; SI-NEXT: s_lshl_b32 s7, s7, 8
1811 ; SI-NEXT: s_cmp_lg_u32 s6, 8
1812 ; SI-NEXT: s_cselect_b32 s10, s10, 5
1813 ; SI-NEXT: s_and_b32 s10, s10, 0xff
1814 ; SI-NEXT: s_or_b32 s7, s10, s7
1815 ; SI-NEXT: s_and_b32 s7, s7, 0xffff
1816 ; SI-NEXT: s_or_b32 s5, s7, s5
1817 ; SI-NEXT: s_lshr_b32 s7, s9, 24
1818 ; SI-NEXT: s_cmp_lg_u32 s6, 7
1819 ; SI-NEXT: s_cselect_b32 s7, s7, 5
1820 ; SI-NEXT: s_lshl_b32 s7, s7, 24
1821 ; SI-NEXT: s_lshr_b32 s10, s9, 16
1822 ; SI-NEXT: s_cmp_lg_u32 s6, 6
1823 ; SI-NEXT: s_cselect_b32 s10, s10, 5
1824 ; SI-NEXT: s_and_b32 s10, s10, 0xff
1825 ; SI-NEXT: s_lshl_b32 s10, s10, 16
1826 ; SI-NEXT: s_or_b32 s7, s7, s10
1827 ; SI-NEXT: s_lshr_b32 s10, s9, 8
1828 ; SI-NEXT: s_cmp_lg_u32 s6, 5
1829 ; SI-NEXT: s_cselect_b32 s10, s10, 5
1830 ; SI-NEXT: s_lshl_b32 s10, s10, 8
1831 ; SI-NEXT: s_cmp_lg_u32 s6, 4
1832 ; SI-NEXT: s_cselect_b32 s9, s9, 5
1833 ; SI-NEXT: s_and_b32 s9, s9, 0xff
1834 ; SI-NEXT: s_or_b32 s9, s9, s10
1835 ; SI-NEXT: s_and_b32 s9, s9, 0xffff
1836 ; SI-NEXT: s_or_b32 s7, s9, s7
1837 ; SI-NEXT: s_lshr_b32 s9, s8, 24
1838 ; SI-NEXT: s_cmp_lg_u32 s6, 3
1839 ; SI-NEXT: s_cselect_b32 s9, s9, 5
1840 ; SI-NEXT: s_lshl_b32 s9, s9, 24
1841 ; SI-NEXT: s_lshr_b32 s10, s8, 16
1842 ; SI-NEXT: s_cmp_lg_u32 s6, 2
1843 ; SI-NEXT: s_cselect_b32 s10, s10, 5
1844 ; SI-NEXT: s_and_b32 s10, s10, 0xff
1845 ; SI-NEXT: s_lshl_b32 s10, s10, 16
1846 ; SI-NEXT: s_or_b32 s9, s9, s10
1847 ; SI-NEXT: s_lshr_b32 s10, s8, 8
1848 ; SI-NEXT: s_cmp_lg_u32 s6, 1
1849 ; SI-NEXT: s_cselect_b32 s10, s10, 5
1850 ; SI-NEXT: s_lshl_b32 s10, s10, 8
1851 ; SI-NEXT: s_cmp_lg_u32 s6, 0
1852 ; SI-NEXT: s_cselect_b32 s6, s8, 5
1853 ; SI-NEXT: s_and_b32 s6, s6, 0xff
1854 ; SI-NEXT: s_or_b32 s6, s6, s10
1855 ; SI-NEXT: s_and_b32 s6, s6, 0xffff
1856 ; SI-NEXT: s_or_b32 s6, s6, s9
1857 ; SI-NEXT: v_mov_b32_e32 v0, s6
1858 ; SI-NEXT: v_mov_b32_e32 v1, s7
1859 ; SI-NEXT: v_mov_b32_e32 v2, s5
1860 ; SI-NEXT: v_mov_b32_e32 v3, s4
1861 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1864 ; VI-LABEL: dynamic_insertelement_v16i8:
1866 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
1867 ; VI-NEXT: s_load_dword s6, s[4:5], 0x20
1868 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1869 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1870 ; VI-NEXT: s_mov_b32 s2, -1
1871 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1872 ; VI-NEXT: s_lshr_b32 s4, s11, 24
1873 ; VI-NEXT: s_cmp_lg_u32 s6, 15
1874 ; VI-NEXT: v_mov_b32_e32 v0, s4
1875 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1876 ; VI-NEXT: s_lshr_b32 s4, s11, 16
1877 ; VI-NEXT: s_cmp_lg_u32 s6, 14
1878 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1879 ; VI-NEXT: v_mov_b32_e32 v1, s4
1880 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1881 ; VI-NEXT: s_lshr_b32 s4, s11, 8
1882 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1883 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1884 ; VI-NEXT: s_cmp_lg_u32 s6, 13
1885 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1886 ; VI-NEXT: v_mov_b32_e32 v1, s4
1887 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1888 ; VI-NEXT: s_cmp_lg_u32 s6, 12
1889 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1890 ; VI-NEXT: v_mov_b32_e32 v2, s11
1891 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1892 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1893 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1894 ; VI-NEXT: s_lshr_b32 s4, s10, 24
1895 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1896 ; VI-NEXT: s_cmp_lg_u32 s6, 11
1897 ; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1898 ; VI-NEXT: v_mov_b32_e32 v0, s4
1899 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1900 ; VI-NEXT: s_lshr_b32 s4, s10, 16
1901 ; VI-NEXT: s_cmp_lg_u32 s6, 10
1902 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1903 ; VI-NEXT: v_mov_b32_e32 v1, s4
1904 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1905 ; VI-NEXT: s_lshr_b32 s4, s10, 8
1906 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1907 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1908 ; VI-NEXT: s_cmp_lg_u32 s6, 9
1909 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1910 ; VI-NEXT: v_mov_b32_e32 v1, s4
1911 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1912 ; VI-NEXT: s_cmp_lg_u32 s6, 8
1913 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1914 ; VI-NEXT: v_mov_b32_e32 v2, s10
1915 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1916 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1917 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1918 ; VI-NEXT: s_lshr_b32 s4, s9, 24
1919 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1920 ; VI-NEXT: s_cmp_lg_u32 s6, 7
1921 ; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1922 ; VI-NEXT: v_mov_b32_e32 v0, s4
1923 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1924 ; VI-NEXT: s_lshr_b32 s4, s9, 16
1925 ; VI-NEXT: s_cmp_lg_u32 s6, 6
1926 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1927 ; VI-NEXT: v_mov_b32_e32 v1, s4
1928 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1929 ; VI-NEXT: s_lshr_b32 s4, s9, 8
1930 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1931 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1932 ; VI-NEXT: s_cmp_lg_u32 s6, 5
1933 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1934 ; VI-NEXT: v_mov_b32_e32 v1, s4
1935 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1936 ; VI-NEXT: s_cmp_lg_u32 s6, 4
1937 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1938 ; VI-NEXT: v_mov_b32_e32 v4, s9
1939 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1940 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1941 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1942 ; VI-NEXT: s_lshr_b32 s4, s8, 24
1943 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1944 ; VI-NEXT: s_cmp_lg_u32 s6, 3
1945 ; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1946 ; VI-NEXT: v_mov_b32_e32 v0, s4
1947 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1948 ; VI-NEXT: s_lshr_b32 s4, s8, 16
1949 ; VI-NEXT: s_cmp_lg_u32 s6, 2
1950 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1951 ; VI-NEXT: v_mov_b32_e32 v4, s4
1952 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1953 ; VI-NEXT: s_lshr_b32 s4, s8, 8
1954 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1955 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1956 ; VI-NEXT: s_cmp_lg_u32 s6, 1
1957 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1958 ; VI-NEXT: v_mov_b32_e32 v4, s4
1959 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1960 ; VI-NEXT: s_cmp_lg_u32 s6, 0
1961 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1962 ; VI-NEXT: v_mov_b32_e32 v5, s8
1963 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1964 ; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
1965 ; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc
1966 ; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1967 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1968 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1970 %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
1971 store <16 x i8> %vecins, ptr addrspace(1) %out, align 16
1975 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
1976 ; the compiler doesn't crash.
; The IR starts a <2 x i32> from undef with %a in element 0, then branches on
; %a == 0. The %if path loads element 1 from %in; the %else path loads it from
; the i32 one element past %in. The two vectors are merged by a phi in %endif
; and stored to %out, so the insert into element 1 is split across two basic
; blocks.
1977 define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b) {
1978 ; SI-LABEL: insert_split_bb:
1979 ; SI: ; %bb.0: ; %entry
1980 ; SI-NEXT: s_load_dword s6, s[4:5], 0x4
1981 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1982 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1983 ; SI-NEXT: s_cmp_lg_u32 s6, 0
1984 ; SI-NEXT: s_cbranch_scc0 .LBB42_4
1985 ; SI-NEXT: ; %bb.1: ; %else
1986 ; SI-NEXT: s_load_dword s7, s[2:3], 0x1
1987 ; SI-NEXT: s_mov_b64 s[4:5], 0
1988 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
1989 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1990 ; SI-NEXT: s_mov_b64 vcc, vcc
1991 ; SI-NEXT: s_cbranch_vccnz .LBB42_3
1992 ; SI-NEXT: .LBB42_2: ; %if
1993 ; SI-NEXT: s_load_dword s7, s[2:3], 0x0
1994 ; SI-NEXT: .LBB42_3: ; %endif
1995 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1996 ; SI-NEXT: v_mov_b32_e32 v0, s6
1997 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1998 ; SI-NEXT: s_mov_b32 s2, -1
1999 ; SI-NEXT: v_mov_b32_e32 v1, s7
2000 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2002 ; SI-NEXT: .LBB42_4:
2003 ; SI-NEXT: s_branch .LBB42_2
2005 ; VI-LABEL: insert_split_bb:
2006 ; VI: ; %bb.0: ; %entry
2007 ; VI-NEXT: s_load_dword s6, s[4:5], 0x10
2008 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2009 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2010 ; VI-NEXT: s_cmp_lg_u32 s6, 0
2011 ; VI-NEXT: s_cbranch_scc0 .LBB42_4
2012 ; VI-NEXT: ; %bb.1: ; %else
2013 ; VI-NEXT: s_load_dword s7, s[2:3], 0x4
2014 ; VI-NEXT: s_cbranch_execnz .LBB42_3
2015 ; VI-NEXT: .LBB42_2: ; %if
2016 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2017 ; VI-NEXT: s_load_dword s7, s[2:3], 0x0
2018 ; VI-NEXT: .LBB42_3: ; %endif
2019 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2020 ; VI-NEXT: v_mov_b32_e32 v0, s6
2021 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
2022 ; VI-NEXT: s_mov_b32 s2, -1
2023 ; VI-NEXT: v_mov_b32_e32 v1, s7
2024 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2026 ; VI-NEXT: .LBB42_4:
2027 ; VI-NEXT: s_branch .LBB42_2
2029 %0 = insertelement <2 x i32> undef, i32 %a, i32 0
2030 %1 = icmp eq i32 %a, 0
2031 br i1 %1, label %if, label %else
2034 %2 = load i32, ptr addrspace(1) %in
2035 %3 = insertelement <2 x i32> %0, i32 %2, i32 1
2039 %4 = getelementptr i32, ptr addrspace(1) %in, i32 1
2040 %5 = load i32, ptr addrspace(1) %4
2041 %6 = insertelement <2 x i32> %0, i32 %5, i32 1
2045 %7 = phi <2 x i32> [%3, %if], [%6, %else]
2046 store <2 x i32> %7, ptr addrspace(1) %out
; Insert the constant double 8.0 into <2 x double> %a at the runtime index %b
; and store the result to %out. Unnamed [8 x i32] arguments separate %out, %a
; and %b in the kernel argument list. The expected code compares the index
; against 1 and then 0; each compare feeds two s_cselect_b32 that replace the
; element's high dword with 0x40200000 and its low dword with 0 (the two
; halves of 8.0 as an IEEE double), followed by one dwordx4 store.
2050 define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
2051 ; SI-LABEL: dynamic_insertelement_v2f64:
2053 ; SI-NEXT: s_load_dword s8, s[4:5], 0x18
2054 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc
2055 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
2056 ; SI-NEXT: s_mov_b32 s7, 0x100f000
2057 ; SI-NEXT: s_mov_b32 s6, -1
2058 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2059 ; SI-NEXT: s_cmp_eq_u32 s8, 1
2060 ; SI-NEXT: s_cselect_b32 s3, 0x40200000, s3
2061 ; SI-NEXT: s_cselect_b32 s2, 0, s2
2062 ; SI-NEXT: s_cmp_eq_u32 s8, 0
2063 ; SI-NEXT: s_cselect_b32 s1, 0x40200000, s1
2064 ; SI-NEXT: s_cselect_b32 s0, 0, s0
2065 ; SI-NEXT: v_mov_b32_e32 v0, s0
2066 ; SI-NEXT: v_mov_b32_e32 v1, s1
2067 ; SI-NEXT: v_mov_b32_e32 v2, s2
2068 ; SI-NEXT: v_mov_b32_e32 v3, s3
2069 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2072 ; VI-LABEL: dynamic_insertelement_v2f64:
2074 ; VI-NEXT: s_load_dword s8, s[4:5], 0x60
2075 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30
2076 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
2077 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
2078 ; VI-NEXT: s_mov_b32 s6, -1
2079 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2080 ; VI-NEXT: s_cmp_eq_u32 s8, 1
2081 ; VI-NEXT: s_cselect_b32 s3, 0x40200000, s3
2082 ; VI-NEXT: s_cselect_b32 s2, 0, s2
2083 ; VI-NEXT: s_cmp_eq_u32 s8, 0
2084 ; VI-NEXT: s_cselect_b32 s1, 0x40200000, s1
2085 ; VI-NEXT: s_cselect_b32 s0, 0, s0
2086 ; VI-NEXT: v_mov_b32_e32 v0, s0
2087 ; VI-NEXT: v_mov_b32_e32 v1, s1
2088 ; VI-NEXT: v_mov_b32_e32 v2, s2
2089 ; VI-NEXT: v_mov_b32_e32 v3, s3
2090 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2092 %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
2093 store <2 x double> %vecins, ptr addrspace(1) %out, align 16
; Insert the constant i64 5 into <2 x i64> %a at the runtime index %b and
; store the result to %out (store alignment 8). The expected code compares
; the index against 1 and then 0; each compare feeds two s_cselect_b32 that
; replace the element's high dword with 0 and its low dword with 5, followed
; by one dwordx4 store.
2097 define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 x i64> %a, i32 %b) nounwind {
2098 ; SI-LABEL: dynamic_insertelement_v2i64:
2100 ; SI-NEXT: s_load_dword s8, s[4:5], 0x8
2101 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
2102 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
2103 ; SI-NEXT: s_mov_b32 s7, 0x100f000
2104 ; SI-NEXT: s_mov_b32 s6, -1
2105 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2106 ; SI-NEXT: s_cmp_eq_u32 s8, 1
2107 ; SI-NEXT: s_cselect_b32 s3, 0, s3
2108 ; SI-NEXT: s_cselect_b32 s2, 5, s2
2109 ; SI-NEXT: s_cmp_eq_u32 s8, 0
2110 ; SI-NEXT: s_cselect_b32 s1, 0, s1
2111 ; SI-NEXT: s_cselect_b32 s0, 5, s0
2112 ; SI-NEXT: v_mov_b32_e32 v0, s0
2113 ; SI-NEXT: v_mov_b32_e32 v1, s1
2114 ; SI-NEXT: v_mov_b32_e32 v2, s2
2115 ; SI-NEXT: v_mov_b32_e32 v3, s3
2116 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2119 ; VI-LABEL: dynamic_insertelement_v2i64:
2121 ; VI-NEXT: s_load_dword s8, s[4:5], 0x20
2122 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
2123 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
2124 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
2125 ; VI-NEXT: s_mov_b32 s6, -1
2126 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2127 ; VI-NEXT: s_cmp_eq_u32 s8, 1
2128 ; VI-NEXT: s_cselect_b32 s3, 0, s3
2129 ; VI-NEXT: s_cselect_b32 s2, 5, s2
2130 ; VI-NEXT: s_cmp_eq_u32 s8, 0
2131 ; VI-NEXT: s_cselect_b32 s1, 0, s1
2132 ; VI-NEXT: s_cselect_b32 s0, 5, s0
2133 ; VI-NEXT: v_mov_b32_e32 v0, s0
2134 ; VI-NEXT: v_mov_b32_e32 v1, s1
2135 ; VI-NEXT: v_mov_b32_e32 v2, s2
2136 ; VI-NEXT: v_mov_b32_e32 v3, s3
2137 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2139 %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
2140 store <2 x i64> %vecins, ptr addrspace(1) %out, align 8
; Insert the constant i64 5 into <3 x i64> %a at the runtime index %b and
; store the result to %out with alignment 32. The expected code loads the
; first two elements as one dwordx4 and the third as a separate dwordx2,
; compares the index against 1, 0 and 2, and uses a pair of s_cselect_b32 per
; element (high dword 0, low dword 5). The result is written as a dwordx2
; store at offset 16 for element 2 and a dwordx4 store for elements 0 and 1.
2144 define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 x i64> %a, i32 %b) nounwind {
2145 ; SI-LABEL: dynamic_insertelement_v3i64:
2147 ; SI-NEXT: s_load_dword s6, s[4:5], 0x10
2148 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2149 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
2150 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xc
2151 ; SI-NEXT: s_mov_b32 s3, 0x100f000
2152 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2153 ; SI-NEXT: s_cmp_eq_u32 s6, 1
2154 ; SI-NEXT: s_mov_b32 s2, -1
2155 ; SI-NEXT: s_cselect_b32 s7, 0, s11
2156 ; SI-NEXT: s_cselect_b32 s10, 5, s10
2157 ; SI-NEXT: s_cmp_eq_u32 s6, 0
2158 ; SI-NEXT: s_cselect_b32 s9, 0, s9
2159 ; SI-NEXT: s_cselect_b32 s8, 5, s8
2160 ; SI-NEXT: s_cmp_eq_u32 s6, 2
2161 ; SI-NEXT: s_cselect_b32 s5, 0, s5
2162 ; SI-NEXT: s_cselect_b32 s4, 5, s4
2163 ; SI-NEXT: v_mov_b32_e32 v0, s4
2164 ; SI-NEXT: v_mov_b32_e32 v1, s5
2165 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
2166 ; SI-NEXT: v_mov_b32_e32 v0, s8
2167 ; SI-NEXT: v_mov_b32_e32 v1, s9
2168 ; SI-NEXT: v_mov_b32_e32 v2, s10
2169 ; SI-NEXT: v_mov_b32_e32 v3, s7
2170 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2173 ; VI-LABEL: dynamic_insertelement_v3i64:
2175 ; VI-NEXT: s_load_dword s6, s[4:5], 0x40
2176 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2177 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20
2178 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x30
2179 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
2180 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2181 ; VI-NEXT: s_cmp_eq_u32 s6, 1
2182 ; VI-NEXT: s_mov_b32 s2, -1
2183 ; VI-NEXT: s_cselect_b32 s7, 0, s11
2184 ; VI-NEXT: s_cselect_b32 s10, 5, s10
2185 ; VI-NEXT: s_cmp_eq_u32 s6, 0
2186 ; VI-NEXT: s_cselect_b32 s9, 0, s9
2187 ; VI-NEXT: s_cselect_b32 s8, 5, s8
2188 ; VI-NEXT: s_cmp_eq_u32 s6, 2
2189 ; VI-NEXT: s_cselect_b32 s5, 0, s5
2190 ; VI-NEXT: s_cselect_b32 s4, 5, s4
2191 ; VI-NEXT: v_mov_b32_e32 v0, s4
2192 ; VI-NEXT: v_mov_b32_e32 v1, s5
2193 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
2194 ; VI-NEXT: v_mov_b32_e32 v0, s8
2195 ; VI-NEXT: v_mov_b32_e32 v1, s9
2196 ; VI-NEXT: v_mov_b32_e32 v2, s10
2197 ; VI-NEXT: v_mov_b32_e32 v3, s7
2198 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2200 %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
2201 store <3 x i64> %vecins, ptr addrspace(1) %out, align 32
; Insert the constant double 8.0 into <4 x double> %a at the runtime index %b
; and store the result to %out. The expected code loads the whole vector with
; one s_load_dwordx8, compares the index against 1, 0, 3 and 2, and uses a
; pair of s_cselect_b32 per element (high dword 0x40200000, low dword 0). The
; result is written with two dwordx4 stores: elements 2 and 3 at offset 16,
; then elements 0 and 1 at offset 0.
2205 define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 x double> %a, i32 %b) nounwind {
2206 ; SI-LABEL: dynamic_insertelement_v4f64:
2208 ; SI-NEXT: s_load_dword s6, s[4:5], 0x10
2209 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
2210 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2211 ; SI-NEXT: s_mov_b32 s3, 0x100f000
2212 ; SI-NEXT: s_mov_b32 s2, -1
2213 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2214 ; SI-NEXT: s_cmp_eq_u32 s6, 1
2215 ; SI-NEXT: s_cselect_b32 s4, 0x40200000, s11
2216 ; SI-NEXT: s_cselect_b32 s5, 0, s10
2217 ; SI-NEXT: s_cmp_eq_u32 s6, 0
2218 ; SI-NEXT: s_cselect_b32 s7, 0x40200000, s9
2219 ; SI-NEXT: s_cselect_b32 s8, 0, s8
2220 ; SI-NEXT: s_cmp_eq_u32 s6, 3
2221 ; SI-NEXT: s_cselect_b32 s9, 0x40200000, s15
2222 ; SI-NEXT: s_cselect_b32 s10, 0, s14
2223 ; SI-NEXT: s_cmp_eq_u32 s6, 2
2224 ; SI-NEXT: s_cselect_b32 s6, 0x40200000, s13
2225 ; SI-NEXT: s_cselect_b32 s11, 0, s12
2226 ; SI-NEXT: v_mov_b32_e32 v0, s11
2227 ; SI-NEXT: v_mov_b32_e32 v1, s6
2228 ; SI-NEXT: v_mov_b32_e32 v2, s10
2229 ; SI-NEXT: v_mov_b32_e32 v3, s9
2230 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
2232 ; SI-NEXT: v_mov_b32_e32 v0, s8
2233 ; SI-NEXT: v_mov_b32_e32 v1, s7
2234 ; SI-NEXT: v_mov_b32_e32 v2, s5
2235 ; SI-NEXT: v_mov_b32_e32 v3, s4
2236 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2239 ; VI-LABEL: dynamic_insertelement_v4f64:
2241 ; VI-NEXT: s_load_dword s6, s[4:5], 0x40
2242 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
2243 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2244 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
2245 ; VI-NEXT: s_mov_b32 s2, -1
2246 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2247 ; VI-NEXT: s_cmp_eq_u32 s6, 1
2248 ; VI-NEXT: s_cselect_b32 s4, 0x40200000, s11
2249 ; VI-NEXT: s_cselect_b32 s5, 0, s10
2250 ; VI-NEXT: s_cmp_eq_u32 s6, 0
2251 ; VI-NEXT: s_cselect_b32 s7, 0x40200000, s9
2252 ; VI-NEXT: s_cselect_b32 s8, 0, s8
2253 ; VI-NEXT: s_cmp_eq_u32 s6, 3
2254 ; VI-NEXT: s_cselect_b32 s9, 0x40200000, s15
2255 ; VI-NEXT: s_cselect_b32 s10, 0, s14
2256 ; VI-NEXT: s_cmp_eq_u32 s6, 2
2257 ; VI-NEXT: s_cselect_b32 s6, 0x40200000, s13
2258 ; VI-NEXT: s_cselect_b32 s11, 0, s12
2259 ; VI-NEXT: v_mov_b32_e32 v0, s11
2260 ; VI-NEXT: v_mov_b32_e32 v1, s6
2261 ; VI-NEXT: v_mov_b32_e32 v2, s10
2262 ; VI-NEXT: v_mov_b32_e32 v3, s9
2263 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
2265 ; VI-NEXT: v_mov_b32_e32 v0, s8
2266 ; VI-NEXT: v_mov_b32_e32 v1, s7
2267 ; VI-NEXT: v_mov_b32_e32 v2, s5
2268 ; VI-NEXT: v_mov_b32_e32 v3, s4
2269 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2271 %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
2272 store <4 x double> %vecins, ptr addrspace(1) %out, align 16
; Insert the constant double 8.0 into <8 x double> %a at the runtime index %b
; and store the result to %out. This kernel takes its attributes from group
; #0 instead of an inline nounwind. The expected code does not use a
; compare/select chain: it copies the 16 loaded dwords into v0-v15, sets m0
; to %b * 2 (s_lshl_b32 by 1), and writes the element's two halves with
; v_movreld_b32 (0 relative to v0, and v16 = 0x40200000 relative to v1). The
; vector is then written with four dwordx4 stores.
2276 define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 {
2277 ; SI-LABEL: dynamic_insertelement_v8f64:
2279 ; SI-NEXT: s_load_dword s6, s[4:5], 0x20
2280 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
2281 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2282 ; SI-NEXT: v_mov_b32_e32 v16, 0x40200000
2283 ; SI-NEXT: s_mov_b32 s3, 0x100f000
2284 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2285 ; SI-NEXT: s_lshl_b32 s4, s6, 1
2286 ; SI-NEXT: v_mov_b32_e32 v0, s8
2287 ; SI-NEXT: v_mov_b32_e32 v1, s9
2288 ; SI-NEXT: v_mov_b32_e32 v2, s10
2289 ; SI-NEXT: v_mov_b32_e32 v3, s11
2290 ; SI-NEXT: v_mov_b32_e32 v4, s12
2291 ; SI-NEXT: v_mov_b32_e32 v5, s13
2292 ; SI-NEXT: v_mov_b32_e32 v6, s14
2293 ; SI-NEXT: v_mov_b32_e32 v7, s15
2294 ; SI-NEXT: v_mov_b32_e32 v8, s16
2295 ; SI-NEXT: v_mov_b32_e32 v9, s17
2296 ; SI-NEXT: v_mov_b32_e32 v10, s18
2297 ; SI-NEXT: v_mov_b32_e32 v11, s19
2298 ; SI-NEXT: v_mov_b32_e32 v12, s20
2299 ; SI-NEXT: v_mov_b32_e32 v13, s21
2300 ; SI-NEXT: v_mov_b32_e32 v14, s22
2301 ; SI-NEXT: v_mov_b32_e32 v15, s23
2302 ; SI-NEXT: s_mov_b32 m0, s4
2303 ; SI-NEXT: v_movreld_b32_e32 v0, 0
2304 ; SI-NEXT: s_mov_b32 s2, -1
2305 ; SI-NEXT: v_movreld_b32_e32 v1, v16
2306 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
2307 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
2308 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
2309 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2312 ; VI-LABEL: dynamic_insertelement_v8f64:
2314 ; VI-NEXT: s_load_dword s6, s[4:5], 0x80
2315 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
2316 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2317 ; VI-NEXT: v_mov_b32_e32 v16, 0x40200000
2318 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
2319 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2320 ; VI-NEXT: s_lshl_b32 s4, s6, 1
2321 ; VI-NEXT: v_mov_b32_e32 v0, s8
2322 ; VI-NEXT: v_mov_b32_e32 v1, s9
2323 ; VI-NEXT: v_mov_b32_e32 v2, s10
2324 ; VI-NEXT: v_mov_b32_e32 v3, s11
2325 ; VI-NEXT: v_mov_b32_e32 v4, s12
2326 ; VI-NEXT: v_mov_b32_e32 v5, s13
2327 ; VI-NEXT: v_mov_b32_e32 v6, s14
2328 ; VI-NEXT: v_mov_b32_e32 v7, s15
2329 ; VI-NEXT: v_mov_b32_e32 v8, s16
2330 ; VI-NEXT: v_mov_b32_e32 v9, s17
2331 ; VI-NEXT: v_mov_b32_e32 v10, s18
2332 ; VI-NEXT: v_mov_b32_e32 v11, s19
2333 ; VI-NEXT: v_mov_b32_e32 v12, s20
2334 ; VI-NEXT: v_mov_b32_e32 v13, s21
2335 ; VI-NEXT: v_mov_b32_e32 v14, s22
2336 ; VI-NEXT: v_mov_b32_e32 v15, s23
2337 ; VI-NEXT: s_mov_b32 m0, s4
2338 ; VI-NEXT: v_movreld_b32_e32 v0, 0
2339 ; VI-NEXT: s_mov_b32 s2, -1
2340 ; VI-NEXT: v_movreld_b32_e32 v1, v16
2341 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
2342 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
2343 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
2344 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2346 %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
2347 store <8 x double> %vecins, ptr addrspace(1) %out, align 16
; Shared declarations for the tests in this file. The image gather4 intrinsic
; is not called by any function in this part of the file; presumably a test
; earlier in the file uses it (TODO confirm before removing). Attribute group
; #0 is referenced by @dynamic_insertelement_v8f64 and #1 by the intrinsic
; declaration below.
2351 declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
2353 attributes #0 = { nounwind }
2354 attributes #1 = { nounwind readnone }