1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
3 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
5 ; FIXME: Broken on evergreen
6 ; FIXME: For some reason the 8 and 16 vectors are being stored as
7 ; individual elements instead of 128-bit stores.
10 ; FIXME: Why is the constant moved into the intermediate register and
11 ; not just directly into the vector component?
; Constant-index insert of 5.0 (0x40a00000) into lane 0 of <4 x float>.
; CHECK lines are autogenerated -- regenerate with update_llc_test_checks.py, do not edit by hand.
12 define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
13 ; SI-LABEL: insertelement_v4f32_0:
15 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
16 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
17 ; SI-NEXT: s_mov_b32 s8, 0x40a00000
18 ; SI-NEXT: s_mov_b32 s3, 0x100f000
19 ; SI-NEXT: s_mov_b32 s2, -1
20 ; SI-NEXT: s_waitcnt lgkmcnt(0)
21 ; SI-NEXT: v_mov_b32_e32 v0, s4
22 ; SI-NEXT: v_mov_b32_e32 v1, s5
23 ; SI-NEXT: v_mov_b32_e32 v2, s6
24 ; SI-NEXT: v_mov_b32_e32 v3, s7
25 ; SI-NEXT: v_mov_b32_e32 v0, s8
26 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
29 ; VI-LABEL: insertelement_v4f32_0:
31 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
32 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
33 ; VI-NEXT: s_mov_b32 s8, 0x40a00000
34 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
35 ; VI-NEXT: s_mov_b32 s2, -1
36 ; VI-NEXT: s_waitcnt lgkmcnt(0)
37 ; VI-NEXT: v_mov_b32_e32 v0, s4
38 ; VI-NEXT: v_mov_b32_e32 v1, s5
39 ; VI-NEXT: v_mov_b32_e32 v2, s6
40 ; VI-NEXT: v_mov_b32_e32 v3, s7
41 ; VI-NEXT: v_mov_b32_e32 v0, s8
42 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
44 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
45 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
; Constant-index insert of 5.0 into lane 1 of <4 x float>; the v_mov into v1
; overwrites that lane before the dwordx4 store. CHECK lines are autogenerated.
49 define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
50 ; SI-LABEL: insertelement_v4f32_1:
52 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
53 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
54 ; SI-NEXT: s_mov_b32 s8, 0x40a00000
55 ; SI-NEXT: s_mov_b32 s3, 0x100f000
56 ; SI-NEXT: s_mov_b32 s2, -1
57 ; SI-NEXT: s_waitcnt lgkmcnt(0)
58 ; SI-NEXT: v_mov_b32_e32 v0, s4
59 ; SI-NEXT: v_mov_b32_e32 v1, s5
60 ; SI-NEXT: v_mov_b32_e32 v2, s6
61 ; SI-NEXT: v_mov_b32_e32 v3, s7
62 ; SI-NEXT: v_mov_b32_e32 v1, s8
63 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
66 ; VI-LABEL: insertelement_v4f32_1:
68 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
69 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
70 ; VI-NEXT: s_mov_b32 s8, 0x40a00000
71 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
72 ; VI-NEXT: s_mov_b32 s2, -1
73 ; VI-NEXT: s_waitcnt lgkmcnt(0)
74 ; VI-NEXT: v_mov_b32_e32 v0, s4
75 ; VI-NEXT: v_mov_b32_e32 v1, s5
76 ; VI-NEXT: v_mov_b32_e32 v2, s6
77 ; VI-NEXT: v_mov_b32_e32 v3, s7
78 ; VI-NEXT: v_mov_b32_e32 v1, s8
79 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
81 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
82 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
; Constant-index insert of 5.0 into lane 2 of <4 x float>. CHECK lines are autogenerated.
86 define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
87 ; SI-LABEL: insertelement_v4f32_2:
89 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
90 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
91 ; SI-NEXT: s_mov_b32 s8, 0x40a00000
92 ; SI-NEXT: s_mov_b32 s3, 0x100f000
93 ; SI-NEXT: s_mov_b32 s2, -1
94 ; SI-NEXT: s_waitcnt lgkmcnt(0)
95 ; SI-NEXT: v_mov_b32_e32 v0, s4
96 ; SI-NEXT: v_mov_b32_e32 v2, s6
97 ; SI-NEXT: v_mov_b32_e32 v1, s5
98 ; SI-NEXT: v_mov_b32_e32 v3, s7
99 ; SI-NEXT: v_mov_b32_e32 v2, s8
100 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
103 ; VI-LABEL: insertelement_v4f32_2:
105 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
106 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
107 ; VI-NEXT: s_mov_b32 s8, 0x40a00000
108 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
109 ; VI-NEXT: s_mov_b32 s2, -1
110 ; VI-NEXT: s_waitcnt lgkmcnt(0)
111 ; VI-NEXT: v_mov_b32_e32 v0, s4
112 ; VI-NEXT: v_mov_b32_e32 v2, s6
113 ; VI-NEXT: v_mov_b32_e32 v1, s5
114 ; VI-NEXT: v_mov_b32_e32 v3, s7
115 ; VI-NEXT: v_mov_b32_e32 v2, s8
116 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
118 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
119 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
; Constant-index insert of 5.0 into lane 3 of <4 x float>. CHECK lines are autogenerated.
123 define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
124 ; SI-LABEL: insertelement_v4f32_3:
126 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
127 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
128 ; SI-NEXT: s_mov_b32 s8, 0x40a00000
129 ; SI-NEXT: s_mov_b32 s3, 0x100f000
130 ; SI-NEXT: s_mov_b32 s2, -1
131 ; SI-NEXT: s_waitcnt lgkmcnt(0)
132 ; SI-NEXT: v_mov_b32_e32 v0, s4
133 ; SI-NEXT: v_mov_b32_e32 v3, s7
134 ; SI-NEXT: v_mov_b32_e32 v1, s5
135 ; SI-NEXT: v_mov_b32_e32 v2, s6
136 ; SI-NEXT: v_mov_b32_e32 v3, s8
137 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
140 ; VI-LABEL: insertelement_v4f32_3:
142 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
143 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
144 ; VI-NEXT: s_mov_b32 s8, 0x40a00000
145 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
146 ; VI-NEXT: s_mov_b32 s2, -1
147 ; VI-NEXT: s_waitcnt lgkmcnt(0)
148 ; VI-NEXT: v_mov_b32_e32 v0, s4
149 ; VI-NEXT: v_mov_b32_e32 v3, s7
150 ; VI-NEXT: v_mov_b32_e32 v1, s5
151 ; VI-NEXT: v_mov_b32_e32 v2, s6
152 ; VI-NEXT: v_mov_b32_e32 v3, s8
153 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
155 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
156 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
; Integer variant: insert 999 (0x3e7, materialized via s_movk_i32) into lane 0
; of <4 x i32>. CHECK lines are autogenerated.
160 define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
161 ; SI-LABEL: insertelement_v4i32_0:
163 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
164 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
165 ; SI-NEXT: s_waitcnt lgkmcnt(0)
166 ; SI-NEXT: s_movk_i32 s4, 0x3e7
167 ; SI-NEXT: v_mov_b32_e32 v0, s4
168 ; SI-NEXT: s_mov_b32 s3, 0x100f000
169 ; SI-NEXT: s_mov_b32 s2, -1
170 ; SI-NEXT: v_mov_b32_e32 v1, s5
171 ; SI-NEXT: v_mov_b32_e32 v2, s6
172 ; SI-NEXT: v_mov_b32_e32 v3, s7
173 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
176 ; VI-LABEL: insertelement_v4i32_0:
178 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
179 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
180 ; VI-NEXT: s_waitcnt lgkmcnt(0)
181 ; VI-NEXT: s_movk_i32 s4, 0x3e7
182 ; VI-NEXT: v_mov_b32_e32 v0, s4
183 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
184 ; VI-NEXT: s_mov_b32 s2, -1
185 ; VI-NEXT: v_mov_b32_e32 v1, s5
186 ; VI-NEXT: v_mov_b32_e32 v2, s6
187 ; VI-NEXT: v_mov_b32_e32 v3, s7
188 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
190 %vecins = insertelement <4 x i32> %a, i32 999, i32 0
191 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
; Insert 5.0 into lane 1 of <3 x float>; result stored with a dwordx3 store.
; CHECK lines are autogenerated.
195 define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
196 ; SI-LABEL: insertelement_v3f32_1:
198 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
199 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
200 ; SI-NEXT: s_mov_b32 s3, 0x100f000
201 ; SI-NEXT: s_mov_b32 s2, -1
202 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
203 ; SI-NEXT: s_waitcnt lgkmcnt(0)
204 ; SI-NEXT: v_mov_b32_e32 v2, s6
205 ; SI-NEXT: v_mov_b32_e32 v0, s4
206 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
209 ; VI-LABEL: insertelement_v3f32_1:
211 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
212 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
213 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
214 ; VI-NEXT: s_mov_b32 s2, -1
215 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
216 ; VI-NEXT: s_waitcnt lgkmcnt(0)
217 ; VI-NEXT: v_mov_b32_e32 v2, s6
218 ; VI-NEXT: v_mov_b32_e32 v0, s4
219 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
221 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
222 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
; Insert 5.0 into lane 2 of <3 x float>. CHECK lines are autogenerated.
226 define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
227 ; SI-LABEL: insertelement_v3f32_2:
229 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
230 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
231 ; SI-NEXT: s_mov_b32 s3, 0x100f000
232 ; SI-NEXT: s_mov_b32 s2, -1
233 ; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000
234 ; SI-NEXT: s_waitcnt lgkmcnt(0)
235 ; SI-NEXT: v_mov_b32_e32 v1, s5
236 ; SI-NEXT: v_mov_b32_e32 v0, s4
237 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
240 ; VI-LABEL: insertelement_v3f32_2:
242 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
243 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
244 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
245 ; VI-NEXT: s_mov_b32 s2, -1
246 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000
247 ; VI-NEXT: s_waitcnt lgkmcnt(0)
248 ; VI-NEXT: v_mov_b32_e32 v1, s5
249 ; VI-NEXT: v_mov_b32_e32 v0, s4
250 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
252 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
253 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
; Out-of-bounds insert: index 3 into a 3-element vector yields poison per the
; LLVM LangRef, so only the function label is checked (no instruction body).
257 define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
258 ; GCN-LABEL: insertelement_v3f32_3:
261 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
262 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
; Non-kernel function: the inserted value feeds an image_gather4 descriptor
; operand, which must be in SGPRs; s12 is zeroed for element 0 and the
; remaining descriptor registers are filled from it. Common GCN checks only.
266 define <4 x float> @insertelement_to_sgpr() nounwind {
267 ; GCN-LABEL: insertelement_to_sgpr:
269 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
270 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
271 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
272 ; GCN-NEXT: s_mov_b32 s12, 0
273 ; GCN-NEXT: s_mov_b32 s4, s12
274 ; GCN-NEXT: s_mov_b32 s5, s12
275 ; GCN-NEXT: s_mov_b32 s6, s12
276 ; GCN-NEXT: s_mov_b32 s7, s12
277 ; GCN-NEXT: s_mov_b32 s8, s12
278 ; GCN-NEXT: s_mov_b32 s9, s12
279 ; GCN-NEXT: s_mov_b32 s10, s12
280 ; GCN-NEXT: s_mov_b32 s11, s12
281 ; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
282 ; GCN-NEXT: s_waitcnt vmcnt(0)
283 ; GCN-NEXT: s_setpc_b64 s[30:31]
284 %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
285 %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
286 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
287 ret <4 x float> %tmp2
; Dynamic (runtime) index insert into <2 x float>: lowered as one
; v_cmp_ne/v_cndmask select per lane against the index %b, keeping 5.0 where
; the lane index matches. CHECK lines are autogenerated.
290 define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
291 ; SI-LABEL: dynamic_insertelement_v2f32:
293 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
294 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
295 ; SI-NEXT: s_load_dword s4, s[4:5], 0x4
296 ; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000
297 ; SI-NEXT: s_mov_b32 s3, 0x100f000
298 ; SI-NEXT: s_mov_b32 s2, -1
299 ; SI-NEXT: s_waitcnt lgkmcnt(0)
300 ; SI-NEXT: v_mov_b32_e32 v0, s7
301 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
302 ; SI-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
303 ; SI-NEXT: v_mov_b32_e32 v0, s6
304 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
305 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
306 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
309 ; VI-LABEL: dynamic_insertelement_v2f32:
311 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
312 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
313 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
314 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000
315 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
316 ; VI-NEXT: s_mov_b32 s2, -1
317 ; VI-NEXT: s_waitcnt lgkmcnt(0)
318 ; VI-NEXT: v_mov_b32_e32 v0, s7
319 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
320 ; VI-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
321 ; VI-NEXT: v_mov_b32_e32 v0, s6
322 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
323 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
324 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
326 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
327 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
; Dynamic-index insert into <3 x float>: one cmp/cndmask pair per lane (2,1,0),
; then a dwordx3 store. CHECK lines are autogenerated.
331 define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
332 ; SI-LABEL: dynamic_insertelement_v3f32:
334 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
335 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
336 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8
337 ; SI-NEXT: v_mov_b32_e32 v3, 0x40a00000
338 ; SI-NEXT: s_mov_b32 s3, 0x100f000
339 ; SI-NEXT: s_mov_b32 s2, -1
340 ; SI-NEXT: s_waitcnt lgkmcnt(0)
341 ; SI-NEXT: v_mov_b32_e32 v2, s10
342 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
343 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
344 ; SI-NEXT: v_mov_b32_e32 v1, s9
345 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
346 ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
347 ; SI-NEXT: v_mov_b32_e32 v0, s8
348 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
349 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
350 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
353 ; VI-LABEL: dynamic_insertelement_v3f32:
355 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
356 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
357 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20
358 ; VI-NEXT: v_mov_b32_e32 v3, 0x40a00000
359 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
360 ; VI-NEXT: s_mov_b32 s2, -1
361 ; VI-NEXT: s_waitcnt lgkmcnt(0)
362 ; VI-NEXT: v_mov_b32_e32 v2, s10
363 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
364 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
365 ; VI-NEXT: v_mov_b32_e32 v1, s9
366 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
367 ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
368 ; VI-NEXT: v_mov_b32_e32 v0, s8
369 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
370 ; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
371 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
373 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
374 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
; Dynamic-index insert into <4 x float>: per-lane cmp/cndmask selects (lanes
; 3..0) with 5.0 held in v4. CHECK lines are autogenerated.
378 define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
379 ; SI-LABEL: dynamic_insertelement_v4f32:
381 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
382 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
383 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8
384 ; SI-NEXT: v_mov_b32_e32 v4, 0x40a00000
385 ; SI-NEXT: s_mov_b32 s3, 0x100f000
386 ; SI-NEXT: s_mov_b32 s2, -1
387 ; SI-NEXT: s_waitcnt lgkmcnt(0)
388 ; SI-NEXT: v_mov_b32_e32 v0, s11
389 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
390 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc
391 ; SI-NEXT: v_mov_b32_e32 v0, s10
392 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
393 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
394 ; SI-NEXT: v_mov_b32_e32 v0, s9
395 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
396 ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
397 ; SI-NEXT: v_mov_b32_e32 v0, s8
398 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
399 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
400 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
403 ; VI-LABEL: dynamic_insertelement_v4f32:
405 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
406 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
407 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20
408 ; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000
409 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
410 ; VI-NEXT: s_mov_b32 s2, -1
411 ; VI-NEXT: s_waitcnt lgkmcnt(0)
412 ; VI-NEXT: v_mov_b32_e32 v0, s11
413 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
414 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc
415 ; VI-NEXT: v_mov_b32_e32 v0, s10
416 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
417 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
418 ; VI-NEXT: v_mov_b32_e32 v0, s9
419 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
420 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
421 ; VI-NEXT: v_mov_b32_e32 v0, s8
422 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
423 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
424 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
426 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
427 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
; Dynamic-index insert into <8 x float>: still select-based (8 cmp/cndmask
; pairs), result written as two dwordx4 stores (offset:16 then 0).
; CHECK lines are autogenerated.
431 define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
432 ; SI-LABEL: dynamic_insertelement_v8f32:
434 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
435 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
436 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10
437 ; SI-NEXT: v_mov_b32_e32 v4, 0x40a00000
438 ; SI-NEXT: s_mov_b32 s3, 0x100f000
439 ; SI-NEXT: s_mov_b32 s2, -1
440 ; SI-NEXT: s_waitcnt lgkmcnt(0)
441 ; SI-NEXT: v_mov_b32_e32 v0, s11
442 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
443 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc
444 ; SI-NEXT: v_mov_b32_e32 v0, s10
445 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
446 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
447 ; SI-NEXT: v_mov_b32_e32 v0, s9
448 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
449 ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
450 ; SI-NEXT: v_mov_b32_e32 v0, s8
451 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
452 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
453 ; SI-NEXT: v_mov_b32_e32 v5, s15
454 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
455 ; SI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc
456 ; SI-NEXT: v_mov_b32_e32 v5, s14
457 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
458 ; SI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
459 ; SI-NEXT: v_mov_b32_e32 v5, s13
460 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
461 ; SI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc
462 ; SI-NEXT: v_mov_b32_e32 v8, s12
463 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
464 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
465 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
466 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
469 ; VI-LABEL: dynamic_insertelement_v8f32:
471 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
472 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
473 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40
474 ; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000
475 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
476 ; VI-NEXT: s_mov_b32 s2, -1
477 ; VI-NEXT: s_waitcnt lgkmcnt(0)
478 ; VI-NEXT: v_mov_b32_e32 v0, s11
479 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
480 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc
481 ; VI-NEXT: v_mov_b32_e32 v0, s10
482 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
483 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
484 ; VI-NEXT: v_mov_b32_e32 v0, s9
485 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
486 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
487 ; VI-NEXT: v_mov_b32_e32 v0, s8
488 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
489 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
490 ; VI-NEXT: v_mov_b32_e32 v5, s15
491 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
492 ; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc
493 ; VI-NEXT: v_mov_b32_e32 v5, s14
494 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
495 ; VI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
496 ; VI-NEXT: v_mov_b32_e32 v5, s13
497 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
498 ; VI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc
499 ; VI-NEXT: v_mov_b32_e32 v8, s12
500 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
501 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
502 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
503 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
505 %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
506 store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
; Dynamic-index insert into <16 x float>: at this width the lowering switches
; from per-lane selects to the indirect-register strategy (m0 = index,
; v_movreld_b32), then four dwordx4 stores. CHECK lines are autogenerated.
510 define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
511 ; SI-LABEL: dynamic_insertelement_v16f32:
513 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
514 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
515 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
516 ; SI-NEXT: s_mov_b32 s3, 0x100f000
517 ; SI-NEXT: s_mov_b32 s2, -1
518 ; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000
519 ; SI-NEXT: s_waitcnt lgkmcnt(0)
520 ; SI-NEXT: v_mov_b32_e32 v0, s8
521 ; SI-NEXT: v_mov_b32_e32 v1, s9
522 ; SI-NEXT: v_mov_b32_e32 v2, s10
523 ; SI-NEXT: v_mov_b32_e32 v3, s11
524 ; SI-NEXT: v_mov_b32_e32 v4, s12
525 ; SI-NEXT: v_mov_b32_e32 v5, s13
526 ; SI-NEXT: v_mov_b32_e32 v6, s14
527 ; SI-NEXT: v_mov_b32_e32 v7, s15
528 ; SI-NEXT: v_mov_b32_e32 v8, s16
529 ; SI-NEXT: v_mov_b32_e32 v9, s17
530 ; SI-NEXT: v_mov_b32_e32 v10, s18
531 ; SI-NEXT: v_mov_b32_e32 v11, s19
532 ; SI-NEXT: v_mov_b32_e32 v12, s20
533 ; SI-NEXT: v_mov_b32_e32 v13, s21
534 ; SI-NEXT: v_mov_b32_e32 v14, s22
535 ; SI-NEXT: v_mov_b32_e32 v15, s23
536 ; SI-NEXT: s_mov_b32 m0, s4
537 ; SI-NEXT: v_movreld_b32_e32 v0, v16
538 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
539 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
540 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
541 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
544 ; VI-LABEL: dynamic_insertelement_v16f32:
546 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
547 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
548 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
549 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
550 ; VI-NEXT: s_mov_b32 s2, -1
551 ; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000
552 ; VI-NEXT: s_waitcnt lgkmcnt(0)
553 ; VI-NEXT: v_mov_b32_e32 v0, s8
554 ; VI-NEXT: v_mov_b32_e32 v1, s9
555 ; VI-NEXT: v_mov_b32_e32 v2, s10
556 ; VI-NEXT: v_mov_b32_e32 v3, s11
557 ; VI-NEXT: v_mov_b32_e32 v4, s12
558 ; VI-NEXT: v_mov_b32_e32 v5, s13
559 ; VI-NEXT: v_mov_b32_e32 v6, s14
560 ; VI-NEXT: v_mov_b32_e32 v7, s15
561 ; VI-NEXT: v_mov_b32_e32 v8, s16
562 ; VI-NEXT: v_mov_b32_e32 v9, s17
563 ; VI-NEXT: v_mov_b32_e32 v10, s18
564 ; VI-NEXT: v_mov_b32_e32 v11, s19
565 ; VI-NEXT: v_mov_b32_e32 v12, s20
566 ; VI-NEXT: v_mov_b32_e32 v13, s21
567 ; VI-NEXT: v_mov_b32_e32 v14, s22
568 ; VI-NEXT: v_mov_b32_e32 v15, s23
569 ; VI-NEXT: s_mov_b32 m0, s4
570 ; VI-NEXT: v_movreld_b32_e32 v0, v16
571 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
572 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
573 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
574 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
576 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
577 store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
; Dynamic-index insert of i32 5 into <2 x i32>; 5 fits as an inline constant,
; so the cndmask uses the literal directly instead of a staging register.
; CHECK lines are autogenerated.
581 define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
582 ; SI-LABEL: dynamic_insertelement_v2i32:
584 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
585 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
586 ; SI-NEXT: s_load_dword s4, s[4:5], 0x4
587 ; SI-NEXT: s_mov_b32 s3, 0x100f000
588 ; SI-NEXT: s_mov_b32 s2, -1
589 ; SI-NEXT: s_waitcnt lgkmcnt(0)
590 ; SI-NEXT: v_mov_b32_e32 v0, s7
591 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
592 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
593 ; SI-NEXT: v_mov_b32_e32 v0, s6
594 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
595 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
596 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
599 ; VI-LABEL: dynamic_insertelement_v2i32:
601 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
602 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
603 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
604 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
605 ; VI-NEXT: s_mov_b32 s2, -1
606 ; VI-NEXT: s_waitcnt lgkmcnt(0)
607 ; VI-NEXT: v_mov_b32_e32 v0, s7
608 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
609 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
610 ; VI-NEXT: v_mov_b32_e32 v0, s6
611 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
612 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
613 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
615 %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
616 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
; Dynamic-index insert of i32 5 (inline constant) into <3 x i32>; select per
; lane, dwordx3 store. CHECK lines are autogenerated.
620 define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
621 ; SI-LABEL: dynamic_insertelement_v3i32:
623 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
624 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
625 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8
626 ; SI-NEXT: s_mov_b32 s3, 0x100f000
627 ; SI-NEXT: s_mov_b32 s2, -1
628 ; SI-NEXT: s_waitcnt lgkmcnt(0)
629 ; SI-NEXT: v_mov_b32_e32 v0, s10
630 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
631 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc
632 ; SI-NEXT: v_mov_b32_e32 v0, s9
633 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
634 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
635 ; SI-NEXT: v_mov_b32_e32 v0, s8
636 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
637 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
638 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
641 ; VI-LABEL: dynamic_insertelement_v3i32:
643 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
644 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
645 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20
646 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
647 ; VI-NEXT: s_mov_b32 s2, -1
648 ; VI-NEXT: s_waitcnt lgkmcnt(0)
649 ; VI-NEXT: v_mov_b32_e32 v0, s10
650 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
651 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc
652 ; VI-NEXT: v_mov_b32_e32 v0, s9
653 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
654 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
655 ; VI-NEXT: v_mov_b32_e32 v0, s8
656 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
657 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
658 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
660 %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
661 store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
; Dynamic-index insert of a *runtime* value %val (not a constant) into
; <4 x i32>; the [8 x i32] padding arg forces %val to a distinct kernarg
; offset (0x11/0x44). Selects use v_cmp_eq with operands swapped relative to
; the cmp_ne pattern in the constant-value tests. CHECK lines are autogenerated.
665 define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
666 ; SI-LABEL: dynamic_insertelement_v4i32:
668 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
669 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
670 ; SI-NEXT: s_load_dword s6, s[4:5], 0x8
671 ; SI-NEXT: s_load_dword s4, s[4:5], 0x11
672 ; SI-NEXT: s_mov_b32 s3, 0x100f000
673 ; SI-NEXT: s_mov_b32 s2, -1
674 ; SI-NEXT: s_waitcnt lgkmcnt(0)
675 ; SI-NEXT: v_mov_b32_e32 v0, s11
676 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3
677 ; SI-NEXT: v_mov_b32_e32 v4, s4
678 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
679 ; SI-NEXT: v_mov_b32_e32 v0, s10
680 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2
681 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
682 ; SI-NEXT: v_mov_b32_e32 v0, s9
683 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1
684 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
685 ; SI-NEXT: v_mov_b32_e32 v0, s8
686 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0
687 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
688 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
691 ; VI-LABEL: dynamic_insertelement_v4i32:
693 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
694 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
695 ; VI-NEXT: s_load_dword s6, s[4:5], 0x20
696 ; VI-NEXT: s_load_dword s4, s[4:5], 0x44
697 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
698 ; VI-NEXT: s_mov_b32 s2, -1
699 ; VI-NEXT: s_waitcnt lgkmcnt(0)
700 ; VI-NEXT: v_mov_b32_e32 v0, s11
701 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3
702 ; VI-NEXT: v_mov_b32_e32 v4, s4
703 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
704 ; VI-NEXT: v_mov_b32_e32 v0, s10
705 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2
706 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
707 ; VI-NEXT: v_mov_b32_e32 v0, s9
708 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1
709 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
710 ; VI-NEXT: v_mov_b32_e32 v0, s8
711 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0
712 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
713 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
715 %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
716 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
; Dynamic-index insert of i32 5 (inline constant) into <8 x i32>: eight
; cmp_ne/cndmask selects, two dwordx4 stores. CHECK lines are autogenerated.
720 define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
721 ; SI-LABEL: dynamic_insertelement_v8i32:
723 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
724 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
725 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10
726 ; SI-NEXT: s_mov_b32 s3, 0x100f000
727 ; SI-NEXT: s_mov_b32 s2, -1
728 ; SI-NEXT: s_waitcnt lgkmcnt(0)
729 ; SI-NEXT: v_mov_b32_e32 v0, s11
730 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
731 ; SI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc
732 ; SI-NEXT: v_mov_b32_e32 v0, s10
733 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
734 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc
735 ; SI-NEXT: v_mov_b32_e32 v0, s9
736 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
737 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
738 ; SI-NEXT: v_mov_b32_e32 v0, s8
739 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
740 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
741 ; SI-NEXT: v_mov_b32_e32 v4, s15
742 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
743 ; SI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc
744 ; SI-NEXT: v_mov_b32_e32 v4, s14
745 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
746 ; SI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc
747 ; SI-NEXT: v_mov_b32_e32 v4, s13
748 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
749 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc
750 ; SI-NEXT: v_mov_b32_e32 v4, s12
751 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
752 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
753 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
754 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
757 ; VI-LABEL: dynamic_insertelement_v8i32:
759 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
760 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
761 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40
762 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
763 ; VI-NEXT: s_mov_b32 s2, -1
764 ; VI-NEXT: s_waitcnt lgkmcnt(0)
765 ; VI-NEXT: v_mov_b32_e32 v0, s11
766 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
767 ; VI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc
768 ; VI-NEXT: v_mov_b32_e32 v0, s10
769 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
770 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc
771 ; VI-NEXT: v_mov_b32_e32 v0, s9
772 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
773 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
774 ; VI-NEXT: v_mov_b32_e32 v0, s8
775 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
776 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
777 ; VI-NEXT: v_mov_b32_e32 v4, s15
778 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
779 ; VI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc
780 ; VI-NEXT: v_mov_b32_e32 v4, s14
781 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
782 ; VI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc
783 ; VI-NEXT: v_mov_b32_e32 v4, s13
784 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
785 ; VI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc
786 ; VI-NEXT: v_mov_b32_e32 v4, s12
787 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
788 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
789 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
790 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
792 %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
793 store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
; Dynamic-index insert of i32 5 into <16 x i32>: indirect-register lowering
; (m0 = index, v_movreld_b32 with the inline constant 5), four dwordx4 stores.
; CHECK lines are autogenerated.
797 define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
798 ; SI-LABEL: dynamic_insertelement_v16i32:
800 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
801 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
802 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
803 ; SI-NEXT: s_mov_b32 s3, 0x100f000
804 ; SI-NEXT: s_mov_b32 s2, -1
805 ; SI-NEXT: s_waitcnt lgkmcnt(0)
806 ; SI-NEXT: v_mov_b32_e32 v0, s8
807 ; SI-NEXT: v_mov_b32_e32 v1, s9
808 ; SI-NEXT: v_mov_b32_e32 v2, s10
809 ; SI-NEXT: v_mov_b32_e32 v3, s11
810 ; SI-NEXT: v_mov_b32_e32 v4, s12
811 ; SI-NEXT: v_mov_b32_e32 v5, s13
812 ; SI-NEXT: v_mov_b32_e32 v6, s14
813 ; SI-NEXT: v_mov_b32_e32 v7, s15
814 ; SI-NEXT: v_mov_b32_e32 v8, s16
815 ; SI-NEXT: v_mov_b32_e32 v9, s17
816 ; SI-NEXT: v_mov_b32_e32 v10, s18
817 ; SI-NEXT: v_mov_b32_e32 v11, s19
818 ; SI-NEXT: v_mov_b32_e32 v12, s20
819 ; SI-NEXT: v_mov_b32_e32 v13, s21
820 ; SI-NEXT: v_mov_b32_e32 v14, s22
821 ; SI-NEXT: v_mov_b32_e32 v15, s23
822 ; SI-NEXT: s_mov_b32 m0, s4
823 ; SI-NEXT: v_movreld_b32_e32 v0, 5
824 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
825 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
826 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
827 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
830 ; VI-LABEL: dynamic_insertelement_v16i32:
832 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
833 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
834 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
835 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
836 ; VI-NEXT: s_mov_b32 s2, -1
837 ; VI-NEXT: s_waitcnt lgkmcnt(0)
838 ; VI-NEXT: v_mov_b32_e32 v0, s8
839 ; VI-NEXT: v_mov_b32_e32 v1, s9
840 ; VI-NEXT: v_mov_b32_e32 v2, s10
841 ; VI-NEXT: v_mov_b32_e32 v3, s11
842 ; VI-NEXT: v_mov_b32_e32 v4, s12
843 ; VI-NEXT: v_mov_b32_e32 v5, s13
844 ; VI-NEXT: v_mov_b32_e32 v6, s14
845 ; VI-NEXT: v_mov_b32_e32 v7, s15
846 ; VI-NEXT: v_mov_b32_e32 v8, s16
847 ; VI-NEXT: v_mov_b32_e32 v9, s17
848 ; VI-NEXT: v_mov_b32_e32 v10, s18
849 ; VI-NEXT: v_mov_b32_e32 v11, s19
850 ; VI-NEXT: v_mov_b32_e32 v12, s20
851 ; VI-NEXT: v_mov_b32_e32 v13, s21
852 ; VI-NEXT: v_mov_b32_e32 v14, s22
853 ; VI-NEXT: v_mov_b32_e32 v15, s23
854 ; VI-NEXT: s_mov_b32 m0, s4
855 ; VI-NEXT: v_movreld_b32_e32 v0, 5
856 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
857 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
858 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
859 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
861 %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
862 store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
; Variable-index insert of i16 5 into <2 x i16>: lowered as a single bitfield
; insert (v_bfi_b32) of the splat 0x50005 using a 0xffff mask shifted by 16*b,
; then stored as one dword.
866 define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
867 ; SI-LABEL: dynamic_insertelement_v2i16:
869 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
870 ; SI-NEXT: s_load_dword s6, s[4:5], 0x2
871 ; SI-NEXT: s_load_dword s4, s[4:5], 0x3
872 ; SI-NEXT: v_mov_b32_e32 v0, 0x50005
873 ; SI-NEXT: s_mov_b32 s3, 0x100f000
874 ; SI-NEXT: s_mov_b32 s2, -1
875 ; SI-NEXT: s_waitcnt lgkmcnt(0)
876 ; SI-NEXT: v_mov_b32_e32 v1, s6
877 ; SI-NEXT: s_lshl_b32 s4, s4, 4
878 ; SI-NEXT: s_lshl_b32 s4, 0xffff, s4
879 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
880 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
883 ; VI-LABEL: dynamic_insertelement_v2i16:
885 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
886 ; VI-NEXT: s_load_dword s6, s[4:5], 0x8
887 ; VI-NEXT: s_load_dword s4, s[4:5], 0xc
888 ; VI-NEXT: v_mov_b32_e32 v0, 0x50005
889 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
890 ; VI-NEXT: s_mov_b32 s2, -1
891 ; VI-NEXT: s_waitcnt lgkmcnt(0)
892 ; VI-NEXT: v_mov_b32_e32 v1, s6
893 ; VI-NEXT: s_lshl_b32 s4, s4, 4
894 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4
895 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
896 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
898 %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
899 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
; Variable-index insert of i16 5 into <3 x i16>: SI performs the insert with
; 64-bit scalar mask arithmetic (s_lshl_b64 / s_andn2_b64 / s_or_b64); VI uses
; two v_bfi_b32 ops on the dword halves. Result is stored as dword + short.
903 define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
904 ; SI-LABEL: dynamic_insertelement_v3i16:
906 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
907 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
908 ; SI-NEXT: s_load_dword s4, s[4:5], 0x4
909 ; SI-NEXT: s_mov_b32 s5, 0
910 ; SI-NEXT: s_mov_b32 s3, 0x100f000
911 ; SI-NEXT: s_mov_b32 s2, -1
912 ; SI-NEXT: s_waitcnt lgkmcnt(0)
913 ; SI-NEXT: s_lshl_b32 s8, s4, 4
914 ; SI-NEXT: s_mov_b32 s4, 0xffff
915 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
916 ; SI-NEXT: s_mov_b32 s8, 0x50005
917 ; SI-NEXT: s_and_b32 s9, s5, s8
918 ; SI-NEXT: s_and_b32 s8, s4, s8
919 ; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
920 ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
921 ; SI-NEXT: v_mov_b32_e32 v0, s5
922 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
923 ; SI-NEXT: v_mov_b32_e32 v0, s4
924 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
927 ; VI-LABEL: dynamic_insertelement_v3i16:
929 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
930 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
931 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
932 ; VI-NEXT: s_mov_b32 s5, 0
933 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
934 ; VI-NEXT: s_mov_b32 s2, -1
935 ; VI-NEXT: s_waitcnt lgkmcnt(0)
936 ; VI-NEXT: v_mov_b32_e32 v1, s7
937 ; VI-NEXT: s_lshl_b32 s8, s4, 4
938 ; VI-NEXT: s_mov_b32 s4, 0xffff
939 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
940 ; VI-NEXT: s_mov_b32 s8, 0x50005
941 ; VI-NEXT: v_mov_b32_e32 v0, s8
942 ; VI-NEXT: v_bfi_b32 v0, s5, v0, v1
943 ; VI-NEXT: v_mov_b32_e32 v1, s8
944 ; VI-NEXT: v_mov_b32_e32 v2, s6
945 ; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
946 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
947 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
949 %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
950 store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
; Variable-index insert of i8 5 into <2 x i8> (with [8 x i32] argument padding
; to spread kernarg offsets): SI uses v_bfi_b32 with a -1 mask shifted by 8*b;
; VI builds the mask with 16-bit VALU ops (v_lshlrev_b16) and and/xor/or.
954 define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
955 ; SI-LABEL: dynamic_insertelement_v2i8:
957 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
958 ; SI-NEXT: s_load_dword s6, s[4:5], 0xa
959 ; SI-NEXT: s_load_dword s4, s[4:5], 0x13
960 ; SI-NEXT: v_mov_b32_e32 v0, 0x505
961 ; SI-NEXT: s_mov_b32 s3, 0x100f000
962 ; SI-NEXT: s_mov_b32 s2, -1
963 ; SI-NEXT: s_waitcnt lgkmcnt(0)
964 ; SI-NEXT: v_mov_b32_e32 v1, s6
965 ; SI-NEXT: s_lshl_b32 s4, s4, 3
966 ; SI-NEXT: s_lshl_b32 s4, -1, s4
967 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
968 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
971 ; VI-LABEL: dynamic_insertelement_v2i8:
973 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
974 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28
975 ; VI-NEXT: s_load_dword s4, s[4:5], 0x4c
976 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
977 ; VI-NEXT: s_mov_b32 s2, -1
978 ; VI-NEXT: s_waitcnt lgkmcnt(0)
979 ; VI-NEXT: s_lshl_b32 s4, s4, 3
980 ; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1
981 ; VI-NEXT: v_and_b32_e32 v1, 0x505, v0
982 ; VI-NEXT: v_xor_b32_e32 v0, -1, v0
983 ; VI-NEXT: v_and_b32_e32 v0, s6, v0
984 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
985 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
987 %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
988 store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
992 ; FIXME: post legalize i16 and i32 shifts aren't merged because of
993 ; isTypeDesirableForOp in SimplifyDemandedBits
; Variable-index insert of i8 5 into <3 x i8>: both targets do a v_bfi_b32 of
; the splat 0x5050505 with a 0xffff mask shifted by 8*b, then store as
; short + byte.
; NOTE(review): the 0xffff mask spans 16 bits for an 8-bit element insert --
; confirm this is the intended lowering (see the shift-merging FIXME above).
994 define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
995 ; SI-LABEL: dynamic_insertelement_v3i8:
997 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
998 ; SI-NEXT: s_load_dword s6, s[4:5], 0xa
999 ; SI-NEXT: s_load_dword s4, s[4:5], 0x13
1000 ; SI-NEXT: v_mov_b32_e32 v0, 0x5050505
1001 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1002 ; SI-NEXT: s_mov_b32 s2, -1
1003 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1004 ; SI-NEXT: v_mov_b32_e32 v1, s6
1005 ; SI-NEXT: s_lshl_b32 s4, s4, 3
1006 ; SI-NEXT: s_lshl_b32 s4, 0xffff, s4
1007 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
1008 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1009 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1010 ; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
1013 ; VI-LABEL: dynamic_insertelement_v3i8:
1015 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1016 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28
1017 ; VI-NEXT: s_load_dword s4, s[4:5], 0x4c
1018 ; VI-NEXT: v_mov_b32_e32 v0, 0x5050505
1019 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1020 ; VI-NEXT: s_mov_b32 s2, -1
1021 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1022 ; VI-NEXT: v_mov_b32_e32 v1, s6
1023 ; VI-NEXT: s_lshl_b32 s4, s4, 3
1024 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4
1025 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
1026 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1027 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
1028 ; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
1030 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
1031 store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
; Variable-index insert of i8 5 into <4 x i8>: single v_bfi_b32 of 0x5050505
; with a 0xffff mask shifted by 8*b, stored as one dword.
; NOTE(review): as in v3i8, the 0xffff mask covers 16 bits for an 8-bit
; insert -- verify intended lowering.
1035 define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
1036 ; SI-LABEL: dynamic_insertelement_v4i8:
1038 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1039 ; SI-NEXT: s_load_dword s6, s[4:5], 0xa
1040 ; SI-NEXT: s_load_dword s4, s[4:5], 0x13
1041 ; SI-NEXT: v_mov_b32_e32 v0, 0x5050505
1042 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1043 ; SI-NEXT: s_mov_b32 s2, -1
1044 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1045 ; SI-NEXT: v_mov_b32_e32 v1, s6
1046 ; SI-NEXT: s_lshl_b32 s4, s4, 3
1047 ; SI-NEXT: s_lshl_b32 s4, 0xffff, s4
1048 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
1049 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1052 ; VI-LABEL: dynamic_insertelement_v4i8:
1054 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1055 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28
1056 ; VI-NEXT: s_load_dword s4, s[4:5], 0x4c
1057 ; VI-NEXT: v_mov_b32_e32 v0, 0x5050505
1058 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1059 ; VI-NEXT: s_mov_b32 s2, -1
1060 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1061 ; VI-NEXT: v_mov_b32_e32 v1, s6
1062 ; VI-NEXT: s_lshl_b32 s4, s4, 3
1063 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4
1064 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
1065 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1067 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
1068 store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
; Variable-index insert of i8 5 into a <8 x i8> loaded from a constant
; (addrspace 4) pointer: both targets do the insert entirely in SGPRs using a
; 64-bit shifted mask (s_lshl_b64 / s_andn2_b64 / s_or_b64), then store dwordx2.
1072 define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
1073 ; SI-LABEL: s_dynamic_insertelement_v8i8:
1075 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
1076 ; SI-NEXT: s_load_dword s6, s[4:5], 0x4
1077 ; SI-NEXT: s_mov_b32 s7, 0
1078 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1079 ; SI-NEXT: s_mov_b32 s2, -1
1080 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1081 ; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
1082 ; SI-NEXT: s_mov_b32 s0, s8
1083 ; SI-NEXT: s_lshl_b32 s8, s6, 3
1084 ; SI-NEXT: s_mov_b32 s6, 0xffff
1085 ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
1086 ; SI-NEXT: s_mov_b32 s8, 0x5050505
1087 ; SI-NEXT: s_mov_b32 s1, s9
1088 ; SI-NEXT: s_and_b32 s9, s7, s8
1089 ; SI-NEXT: s_and_b32 s8, s6, s8
1090 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1091 ; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
1092 ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
1093 ; SI-NEXT: v_mov_b32_e32 v0, s4
1094 ; SI-NEXT: v_mov_b32_e32 v1, s5
1095 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1098 ; VI-LABEL: s_dynamic_insertelement_v8i8:
1100 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
1101 ; VI-NEXT: s_load_dword s6, s[4:5], 0x10
1102 ; VI-NEXT: s_mov_b32 s7, 0
1103 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1104 ; VI-NEXT: s_mov_b32 s2, -1
1105 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1106 ; VI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
1107 ; VI-NEXT: s_mov_b32 s0, s8
1108 ; VI-NEXT: s_lshl_b32 s8, s6, 3
1109 ; VI-NEXT: s_mov_b32 s6, 0xffff
1110 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
1111 ; VI-NEXT: s_mov_b32 s8, 0x5050505
1112 ; VI-NEXT: s_mov_b32 s1, s9
1113 ; VI-NEXT: s_and_b32 s9, s7, s8
1114 ; VI-NEXT: s_and_b32 s8, s6, s8
1115 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1116 ; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
1117 ; VI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
1118 ; VI-NEXT: v_mov_b32_e32 v0, s4
1119 ; VI-NEXT: v_mov_b32_e32 v1, s5
1120 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1122 %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
1123 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
1124 store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
; Variable-index insert of i8 5 into <16 x i8>: fully scalarized -- each byte
; is extracted (s_lshr_b32), selected against the index with
; v_cmp_ne_u32 + v_cndmask (5 if index matches, original byte otherwise), and
; repacked: SI with shift/and/or, VI with SDWA v_or_b32_sdwa ops.
1128 define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
1129 ; SI-LABEL: dynamic_insertelement_v16i8:
1131 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1132 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
1133 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8
1134 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1135 ; SI-NEXT: s_mov_b32 s2, -1
1136 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1137 ; SI-NEXT: s_lshr_b32 s5, s11, 24
1138 ; SI-NEXT: v_mov_b32_e32 v0, s5
1139 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15
1140 ; SI-NEXT: s_lshr_b32 s5, s11, 16
1141 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1142 ; SI-NEXT: v_mov_b32_e32 v1, s5
1143 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14
1144 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1145 ; SI-NEXT: s_movk_i32 s5, 0xff
1146 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1147 ; SI-NEXT: v_and_b32_e32 v1, s5, v1
1148 ; SI-NEXT: s_lshr_b32 s6, s11, 8
1149 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1150 ; SI-NEXT: v_mov_b32_e32 v1, s6
1151 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13
1152 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1153 ; SI-NEXT: v_mov_b32_e32 v2, s11
1154 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12
1155 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1156 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1157 ; SI-NEXT: v_and_b32_e32 v2, s5, v2
1158 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
1159 ; SI-NEXT: s_mov_b32 s6, 0xffff
1160 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1161 ; SI-NEXT: v_and_b32_e32 v1, s6, v1
1162 ; SI-NEXT: s_lshr_b32 s7, s10, 24
1163 ; SI-NEXT: v_or_b32_e32 v3, v1, v0
1164 ; SI-NEXT: v_mov_b32_e32 v0, s7
1165 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11
1166 ; SI-NEXT: s_lshr_b32 s7, s10, 16
1167 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1168 ; SI-NEXT: v_mov_b32_e32 v1, s7
1169 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10
1170 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1171 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1172 ; SI-NEXT: v_and_b32_e32 v1, s5, v1
1173 ; SI-NEXT: s_lshr_b32 s7, s10, 8
1174 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1175 ; SI-NEXT: v_mov_b32_e32 v1, s7
1176 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9
1177 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1178 ; SI-NEXT: v_mov_b32_e32 v2, s10
1179 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8
1180 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1181 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1182 ; SI-NEXT: v_and_b32_e32 v2, s5, v2
1183 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
1184 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1185 ; SI-NEXT: v_and_b32_e32 v1, s6, v1
1186 ; SI-NEXT: s_lshr_b32 s7, s9, 24
1187 ; SI-NEXT: v_or_b32_e32 v2, v1, v0
1188 ; SI-NEXT: v_mov_b32_e32 v0, s7
1189 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
1190 ; SI-NEXT: s_lshr_b32 s7, s9, 16
1191 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1192 ; SI-NEXT: v_mov_b32_e32 v1, s7
1193 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
1194 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1195 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1196 ; SI-NEXT: v_and_b32_e32 v1, s5, v1
1197 ; SI-NEXT: s_lshr_b32 s7, s9, 8
1198 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1199 ; SI-NEXT: v_mov_b32_e32 v1, s7
1200 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
1201 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1202 ; SI-NEXT: v_mov_b32_e32 v4, s9
1203 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
1204 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1205 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1206 ; SI-NEXT: v_and_b32_e32 v4, s5, v4
1207 ; SI-NEXT: v_or_b32_e32 v1, v4, v1
1208 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1209 ; SI-NEXT: v_and_b32_e32 v1, s6, v1
1210 ; SI-NEXT: s_lshr_b32 s7, s8, 24
1211 ; SI-NEXT: v_or_b32_e32 v1, v1, v0
1212 ; SI-NEXT: v_mov_b32_e32 v0, s7
1213 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
1214 ; SI-NEXT: s_lshr_b32 s7, s8, 16
1215 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1216 ; SI-NEXT: v_mov_b32_e32 v4, s7
1217 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
1218 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1219 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1220 ; SI-NEXT: v_and_b32_e32 v4, s5, v4
1221 ; SI-NEXT: s_lshr_b32 s7, s8, 8
1222 ; SI-NEXT: v_or_b32_e32 v0, v4, v0
1223 ; SI-NEXT: v_mov_b32_e32 v4, s7
1224 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
1225 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1226 ; SI-NEXT: v_mov_b32_e32 v5, s8
1227 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
1228 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc
1229 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
1230 ; SI-NEXT: v_and_b32_e32 v5, s5, v5
1231 ; SI-NEXT: v_or_b32_e32 v4, v5, v4
1232 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1233 ; SI-NEXT: v_and_b32_e32 v4, s6, v4
1234 ; SI-NEXT: v_or_b32_e32 v0, v4, v0
1235 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1238 ; VI-LABEL: dynamic_insertelement_v16i8:
1240 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1241 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
1242 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20
1243 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1244 ; VI-NEXT: s_mov_b32 s2, -1
1245 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1246 ; VI-NEXT: s_lshr_b32 s5, s11, 24
1247 ; VI-NEXT: v_mov_b32_e32 v0, s5
1248 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15
1249 ; VI-NEXT: s_lshr_b32 s5, s11, 16
1250 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1251 ; VI-NEXT: v_mov_b32_e32 v1, s5
1252 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14
1253 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1254 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1255 ; VI-NEXT: s_lshr_b32 s5, s11, 8
1256 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1257 ; VI-NEXT: v_mov_b32_e32 v1, s5
1258 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13
1259 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1260 ; VI-NEXT: v_mov_b32_e32 v2, s11
1261 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12
1262 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1263 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1264 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1265 ; VI-NEXT: s_lshr_b32 s5, s10, 24
1266 ; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1267 ; VI-NEXT: v_mov_b32_e32 v0, s5
1268 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11
1269 ; VI-NEXT: s_lshr_b32 s5, s10, 16
1270 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1271 ; VI-NEXT: v_mov_b32_e32 v1, s5
1272 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10
1273 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1274 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1275 ; VI-NEXT: s_lshr_b32 s5, s10, 8
1276 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1277 ; VI-NEXT: v_mov_b32_e32 v1, s5
1278 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9
1279 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1280 ; VI-NEXT: v_mov_b32_e32 v2, s10
1281 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8
1282 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1283 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1284 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1285 ; VI-NEXT: s_lshr_b32 s5, s9, 24
1286 ; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1287 ; VI-NEXT: v_mov_b32_e32 v0, s5
1288 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
1289 ; VI-NEXT: s_lshr_b32 s5, s9, 16
1290 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1291 ; VI-NEXT: v_mov_b32_e32 v1, s5
1292 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
1293 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1294 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1295 ; VI-NEXT: s_lshr_b32 s5, s9, 8
1296 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1297 ; VI-NEXT: v_mov_b32_e32 v1, s5
1298 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
1299 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1300 ; VI-NEXT: v_mov_b32_e32 v4, s9
1301 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
1302 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1303 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1304 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1305 ; VI-NEXT: s_lshr_b32 s5, s8, 24
1306 ; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1307 ; VI-NEXT: v_mov_b32_e32 v0, s5
1308 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
1309 ; VI-NEXT: s_lshr_b32 s5, s8, 16
1310 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1311 ; VI-NEXT: v_mov_b32_e32 v4, s5
1312 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
1313 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1314 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1315 ; VI-NEXT: s_lshr_b32 s5, s8, 8
1316 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1317 ; VI-NEXT: v_mov_b32_e32 v4, s5
1318 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
1319 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1320 ; VI-NEXT: v_mov_b32_e32 v5, s8
1321 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
1322 ; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
1323 ; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc
1324 ; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1325 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1326 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1328 %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
1329 store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
1333 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
1334 ; the compiler doesn't crash.
; Insert in one basic block, inserted-into value consumed in another: lane 0 is
; set in the entry block, lane 1 is loaded in either %if or %else and the phi
; merges the two vectors. Exercises INSERT_SUBREG across block boundaries in
; SIFixSGPRCopies; the test only requires no crash plus the checked codegen.
1335 define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
1336 ; SI-LABEL: insert_split_bb:
1337 ; SI: ; %bb.0: ; %entry
1338 ; SI-NEXT: s_load_dword s0, s[4:5], 0x4
1339 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
1340 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1341 ; SI-NEXT: s_cmp_lg_u32 s0, 0
1342 ; SI-NEXT: s_cbranch_scc0 BB26_2
1343 ; SI-NEXT: ; %bb.1: ; %else
1344 ; SI-NEXT: s_load_dword s1, s[6:7], 0x1
1345 ; SI-NEXT: s_branch BB26_3
1346 ; SI-NEXT: BB26_2: ; %if
1347 ; SI-NEXT: s_load_dword s1, s[6:7], 0x0
1348 ; SI-NEXT: BB26_3: ; %endif
1349 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1350 ; SI-NEXT: v_mov_b32_e32 v0, s0
1351 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1352 ; SI-NEXT: s_mov_b32 s6, -1
1353 ; SI-NEXT: v_mov_b32_e32 v1, s1
1354 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1357 ; VI-LABEL: insert_split_bb:
1358 ; VI: ; %bb.0: ; %entry
1359 ; VI-NEXT: s_load_dword s0, s[4:5], 0x10
1360 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
1361 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1362 ; VI-NEXT: s_cmp_lg_u32 s0, 0
1363 ; VI-NEXT: s_cbranch_scc0 BB26_2
1364 ; VI-NEXT: ; %bb.1: ; %else
1365 ; VI-NEXT: s_load_dword s1, s[6:7], 0x4
1366 ; VI-NEXT: s_branch BB26_3
1367 ; VI-NEXT: BB26_2: ; %if
1368 ; VI-NEXT: s_load_dword s1, s[6:7], 0x0
1369 ; VI-NEXT: BB26_3: ; %endif
1370 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1371 ; VI-NEXT: v_mov_b32_e32 v0, s0
1372 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
1373 ; VI-NEXT: s_mov_b32 s6, -1
1374 ; VI-NEXT: v_mov_b32_e32 v1, s1
1375 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1378 %0 = insertelement <2 x i32> undef, i32 %a, i32 0
1379 %1 = icmp eq i32 %a, 0
1380 br i1 %1, label %if, label %else
1383 %2 = load i32, i32 addrspace(1)* %in
1384 %3 = insertelement <2 x i32> %0, i32 %2, i32 1
1388 %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
1389 %5 = load i32, i32 addrspace(1)* %4
1390 %6 = insertelement <2 x i32> %0, i32 %5, i32 1
1394 %7 = phi <2 x i32> [%3, %if], [%6, %else]
1395 store <2 x i32> %7, <2 x i32> addrspace(1)* %out
; Variable-index insert of double 8.0 into <2 x f64>: per-lane
; v_cmp_eq_u32 + cndmask pairs, writing 0x40200000 to the high dword and 0 to
; the low dword of the matching lane.
1399 define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
1400 ; SI-LABEL: dynamic_insertelement_v2f64:
1402 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1403 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xc
1404 ; SI-NEXT: s_load_dword s4, s[4:5], 0x18
1405 ; SI-NEXT: v_mov_b32_e32 v1, 0x40200000
1406 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1407 ; SI-NEXT: s_mov_b32 s2, -1
1408 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1409 ; SI-NEXT: v_mov_b32_e32 v0, s11
1410 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
1411 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
1412 ; SI-NEXT: v_mov_b32_e32 v0, s10
1413 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
1414 ; SI-NEXT: v_mov_b32_e32 v0, s9
1415 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
1416 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
1417 ; SI-NEXT: v_mov_b32_e32 v0, s8
1418 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1419 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1422 ; VI-LABEL: dynamic_insertelement_v2f64:
1424 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1425 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x30
1426 ; VI-NEXT: s_load_dword s4, s[4:5], 0x60
1427 ; VI-NEXT: v_mov_b32_e32 v1, 0x40200000
1428 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1429 ; VI-NEXT: s_mov_b32 s2, -1
1430 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1431 ; VI-NEXT: v_mov_b32_e32 v0, s11
1432 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
1433 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
1434 ; VI-NEXT: v_mov_b32_e32 v0, s10
1435 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
1436 ; VI-NEXT: v_mov_b32_e32 v0, s9
1437 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
1438 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
1439 ; VI-NEXT: v_mov_b32_e32 v0, s8
1440 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1441 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1443 %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
1444 store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
; Variable-index insert of i64 5 into <2 x i64>: per-lane v_cmp_eq_u32 with
; the condition in s[4:5], then cndmask each 32-bit half (5 low, 0 high).
1448 define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
1449 ; SI-LABEL: dynamic_insertelement_v2i64:
1451 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1452 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
1453 ; SI-NEXT: s_load_dword s6, s[4:5], 0x8
1454 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1455 ; SI-NEXT: s_mov_b32 s2, -1
1456 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1457 ; SI-NEXT: v_mov_b32_e32 v0, s11
1458 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
1459 ; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1460 ; SI-NEXT: v_mov_b32_e32 v0, s10
1461 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
1462 ; SI-NEXT: v_mov_b32_e32 v0, s9
1463 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
1464 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
1465 ; SI-NEXT: v_mov_b32_e32 v0, s8
1466 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
1467 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1470 ; VI-LABEL: dynamic_insertelement_v2i64:
1472 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1473 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
1474 ; VI-NEXT: s_load_dword s6, s[4:5], 0x20
1475 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1476 ; VI-NEXT: s_mov_b32 s2, -1
1477 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1478 ; VI-NEXT: v_mov_b32_e32 v0, s11
1479 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
1480 ; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1481 ; VI-NEXT: v_mov_b32_e32 v0, s10
1482 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
1483 ; VI-NEXT: v_mov_b32_e32 v0, s9
1484 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
1485 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
1486 ; VI-NEXT: v_mov_b32_e32 v0, s8
1487 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
1488 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1490 %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
1491 store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
; Variable-index insert of i64 5 into <3 x i64>: same per-lane compare +
; cndmask pattern on each 32-bit half, stored as dwordx2 (lane 2) + dwordx4.
1495 define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
1496 ; SI-LABEL: dynamic_insertelement_v3i64:
1498 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1499 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
1500 ; SI-NEXT: s_load_dword s6, s[4:5], 0x10
1501 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1502 ; SI-NEXT: s_mov_b32 s2, -1
1503 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1504 ; SI-NEXT: v_mov_b32_e32 v0, s13
1505 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2
1506 ; SI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
1507 ; SI-NEXT: v_mov_b32_e32 v0, s12
1508 ; SI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5]
1509 ; SI-NEXT: v_mov_b32_e32 v0, s11
1510 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
1511 ; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1512 ; SI-NEXT: v_mov_b32_e32 v0, s10
1513 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
1514 ; SI-NEXT: v_mov_b32_e32 v0, s9
1515 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
1516 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
1517 ; SI-NEXT: v_mov_b32_e32 v0, s8
1518 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
1519 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
1520 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1523 ; VI-LABEL: dynamic_insertelement_v3i64:
1525 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1526 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
1527 ; VI-NEXT: s_load_dword s6, s[4:5], 0x40
1528 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1529 ; VI-NEXT: s_mov_b32 s2, -1
1530 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1531 ; VI-NEXT: v_mov_b32_e32 v0, s13
1532 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2
1533 ; VI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
1534 ; VI-NEXT: v_mov_b32_e32 v0, s12
1535 ; VI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5]
1536 ; VI-NEXT: v_mov_b32_e32 v0, s11
1537 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
1538 ; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1539 ; VI-NEXT: v_mov_b32_e32 v0, s10
1540 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
1541 ; VI-NEXT: v_mov_b32_e32 v0, s9
1542 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
1543 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
1544 ; VI-NEXT: v_mov_b32_e32 v0, s8
1545 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
1546 ; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
1547 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1549 %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
1550 store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
; Variable-index insert of double 8.0 into <4 x f64>: per-lane compare +
; cndmask on each half (high dword 0x40200000, low dword 0), stored as two
; dwordx4 stores.
1554 define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
1555 ; SI-LABEL: dynamic_insertelement_v4f64:
1557 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1558 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
1559 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10
1560 ; SI-NEXT: v_mov_b32_e32 v4, 0x40200000
1561 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1562 ; SI-NEXT: s_mov_b32 s2, -1
1563 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1564 ; SI-NEXT: v_mov_b32_e32 v0, s11
1565 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
1566 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
1567 ; SI-NEXT: v_mov_b32_e32 v0, s10
1568 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
1569 ; SI-NEXT: v_mov_b32_e32 v0, s9
1570 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
1571 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
1572 ; SI-NEXT: v_mov_b32_e32 v0, s8
1573 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1574 ; SI-NEXT: v_mov_b32_e32 v5, s15
1575 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3
1576 ; SI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
1577 ; SI-NEXT: v_mov_b32_e32 v5, s14
1578 ; SI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
1579 ; SI-NEXT: v_mov_b32_e32 v5, s13
1580 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2
1581 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
1582 ; SI-NEXT: v_mov_b32_e32 v4, s12
1583 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
1584 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1585 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1588 ; VI-LABEL: dynamic_insertelement_v4f64:
1590 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1591 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
1592 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40
1593 ; VI-NEXT: v_mov_b32_e32 v4, 0x40200000
1594 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1595 ; VI-NEXT: s_mov_b32 s2, -1
1596 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1597 ; VI-NEXT: v_mov_b32_e32 v0, s11
1598 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
1599 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
1600 ; VI-NEXT: v_mov_b32_e32 v0, s10
1601 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
1602 ; VI-NEXT: v_mov_b32_e32 v0, s9
1603 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
1604 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
1605 ; VI-NEXT: v_mov_b32_e32 v0, s8
1606 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1607 ; VI-NEXT: v_mov_b32_e32 v5, s15
1608 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3
1609 ; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
1610 ; VI-NEXT: v_mov_b32_e32 v5, s14
1611 ; VI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
1612 ; VI-NEXT: v_mov_b32_e32 v5, s13
1613 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2
1614 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
1615 ; VI-NEXT: v_mov_b32_e32 v4, s12
1616 ; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
1617 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1618 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1620 %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
1621 store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
; Inserts double 8.0 (0x40200000_00000000: lo dword 0, hi dword 0x40200000 —
; see the v0/v1 moves before the dwordx2 store below) at the dynamic index %b
; into an <8 x double>. Because the index is not a compile-time constant, the
; generated code spills the whole vector to scratch with four dwordx4 stores
; (offsets 64..112), computes the element byte offset as ((b & 7) * 8) | 64,
; overwrites that element with a buffer_store_dwordx2 ... offen, then reloads
; all four dwordx4 pieces and stores the result to %out.
; NOTE(review): the CHECK lines are autogenerated by
; utils/update_llc_test_checks.py (see file header) — regenerate rather than
; hand-edit them.
1625 define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
1626 ; SI-LABEL: dynamic_insertelement_v8f64:
1628 ; SI-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x0
1629 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
1630 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
1631 ; SI-NEXT: v_mov_b32_e32 v16, 64
1632 ; SI-NEXT: s_mov_b32 s27, 0x100f000
1633 ; SI-NEXT: s_mov_b32 s26, -1
1634 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1635 ; SI-NEXT: v_mov_b32_e32 v0, s8
1636 ; SI-NEXT: s_and_b32 s4, s4, 7
1637 ; SI-NEXT: s_lshl_b32 s4, s4, 3
1638 ; SI-NEXT: v_mov_b32_e32 v1, s9
1639 ; SI-NEXT: v_mov_b32_e32 v2, s10
1640 ; SI-NEXT: v_mov_b32_e32 v3, s11
1641 ; SI-NEXT: v_mov_b32_e32 v4, s12
1642 ; SI-NEXT: v_mov_b32_e32 v5, s13
1643 ; SI-NEXT: v_mov_b32_e32 v6, s14
1644 ; SI-NEXT: v_mov_b32_e32 v7, s15
1645 ; SI-NEXT: v_mov_b32_e32 v8, s16
1646 ; SI-NEXT: v_mov_b32_e32 v9, s17
1647 ; SI-NEXT: v_mov_b32_e32 v10, s18
1648 ; SI-NEXT: v_mov_b32_e32 v11, s19
1649 ; SI-NEXT: v_mov_b32_e32 v12, s20
1650 ; SI-NEXT: v_mov_b32_e32 v13, s21
1651 ; SI-NEXT: v_mov_b32_e32 v14, s22
1652 ; SI-NEXT: v_mov_b32_e32 v15, s23
1653 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
1654 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
1655 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
1656 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
1657 ; SI-NEXT: v_or_b32_e32 v16, s4, v16
1658 ; SI-NEXT: v_mov_b32_e32 v0, 0
1659 ; SI-NEXT: v_mov_b32_e32 v1, 0x40200000
1660 ; SI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen
1661 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64
1662 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80
1663 ; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96
1664 ; SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112
1665 ; SI-NEXT: s_waitcnt vmcnt(0)
1666 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[24:27], 0 offset:48
1667 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[24:27], 0 offset:32
1668 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:16
1669 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[24:27], 0
1672 ; VI-LABEL: dynamic_insertelement_v8f64:
1674 ; VI-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x0
1675 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
1676 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
1677 ; VI-NEXT: v_mov_b32_e32 v16, 64
1678 ; VI-NEXT: s_mov_b32 s27, 0x1100f000
1679 ; VI-NEXT: s_mov_b32 s26, -1
1680 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1681 ; VI-NEXT: v_mov_b32_e32 v0, s8
1682 ; VI-NEXT: s_and_b32 s4, s4, 7
1683 ; VI-NEXT: s_lshl_b32 s4, s4, 3
1684 ; VI-NEXT: v_mov_b32_e32 v1, s9
1685 ; VI-NEXT: v_mov_b32_e32 v2, s10
1686 ; VI-NEXT: v_mov_b32_e32 v3, s11
1687 ; VI-NEXT: v_mov_b32_e32 v4, s12
1688 ; VI-NEXT: v_mov_b32_e32 v5, s13
1689 ; VI-NEXT: v_mov_b32_e32 v6, s14
1690 ; VI-NEXT: v_mov_b32_e32 v7, s15
1691 ; VI-NEXT: v_mov_b32_e32 v8, s16
1692 ; VI-NEXT: v_mov_b32_e32 v9, s17
1693 ; VI-NEXT: v_mov_b32_e32 v10, s18
1694 ; VI-NEXT: v_mov_b32_e32 v11, s19
1695 ; VI-NEXT: v_mov_b32_e32 v12, s20
1696 ; VI-NEXT: v_mov_b32_e32 v13, s21
1697 ; VI-NEXT: v_mov_b32_e32 v14, s22
1698 ; VI-NEXT: v_mov_b32_e32 v15, s23
1699 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
1700 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
1701 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
1702 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
1703 ; VI-NEXT: v_or_b32_e32 v16, s4, v16
1704 ; VI-NEXT: v_mov_b32_e32 v0, 0
1705 ; VI-NEXT: v_mov_b32_e32 v1, 0x40200000
1706 ; VI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen
1707 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64
1708 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80
1709 ; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96
1710 ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112
1711 ; VI-NEXT: s_waitcnt vmcnt(0)
1712 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[24:27], 0 offset:48
1713 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[24:27], 0 offset:32
1714 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:16
1715 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[24:27], 0
; The IR under test: a single variable-index insertelement followed by a
; 128-bit-aligned store of the full 64-byte vector.
1717 %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
1718 store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
; Intrinsic declaration; NOTE(review): no call to it is visible in this chunk —
; presumably used by a test function elsewhere in the file.
1722 declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
; #0: kernels under test; #1: intrinsic declarations.
1724 attributes #0 = { nounwind }
1725 attributes #1 = { nounwind readnone }