1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
3 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
5 ; FIXME: Broken on evergreen
6 ; FIXME: For some reason the 8- and 16-element vectors are being stored as
7 ; individual elements instead of 128-bit stores.
10 ; FIXME: Why is the constant moved into the intermediate register and
11 ; not just directly into the vector component?
12 define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
13 ; SI-LABEL: insertelement_v4f32_0:
15 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
16 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
17 ; SI-NEXT: s_waitcnt lgkmcnt(0)
18 ; SI-NEXT: s_mov_b32 s4, 0x40a00000
19 ; SI-NEXT: v_mov_b32_e32 v0, s4
20 ; SI-NEXT: s_mov_b32 s3, 0x100f000
21 ; SI-NEXT: s_mov_b32 s2, -1
22 ; SI-NEXT: v_mov_b32_e32 v1, s5
23 ; SI-NEXT: v_mov_b32_e32 v2, s6
24 ; SI-NEXT: v_mov_b32_e32 v3, s7
25 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
28 ; VI-LABEL: insertelement_v4f32_0:
30 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
31 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
32 ; VI-NEXT: s_waitcnt lgkmcnt(0)
33 ; VI-NEXT: s_mov_b32 s4, 0x40a00000
34 ; VI-NEXT: v_mov_b32_e32 v0, s4
35 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
36 ; VI-NEXT: s_mov_b32 s2, -1
37 ; VI-NEXT: v_mov_b32_e32 v1, s5
38 ; VI-NEXT: v_mov_b32_e32 v2, s6
39 ; VI-NEXT: v_mov_b32_e32 v3, s7
40 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
42 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
43 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
47 define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
48 ; SI-LABEL: insertelement_v4f32_1:
50 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
51 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
52 ; SI-NEXT: s_waitcnt lgkmcnt(0)
53 ; SI-NEXT: s_mov_b32 s5, 0x40a00000
54 ; SI-NEXT: s_mov_b32 s3, 0x100f000
55 ; SI-NEXT: s_mov_b32 s2, -1
56 ; SI-NEXT: v_mov_b32_e32 v0, s4
57 ; SI-NEXT: v_mov_b32_e32 v1, s5
58 ; SI-NEXT: v_mov_b32_e32 v2, s6
59 ; SI-NEXT: v_mov_b32_e32 v3, s7
60 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
63 ; VI-LABEL: insertelement_v4f32_1:
65 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
66 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
67 ; VI-NEXT: s_waitcnt lgkmcnt(0)
68 ; VI-NEXT: s_mov_b32 s5, 0x40a00000
69 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
70 ; VI-NEXT: s_mov_b32 s2, -1
71 ; VI-NEXT: v_mov_b32_e32 v0, s4
72 ; VI-NEXT: v_mov_b32_e32 v1, s5
73 ; VI-NEXT: v_mov_b32_e32 v2, s6
74 ; VI-NEXT: v_mov_b32_e32 v3, s7
75 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
77 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
78 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
82 define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
83 ; SI-LABEL: insertelement_v4f32_2:
85 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
86 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
87 ; SI-NEXT: s_waitcnt lgkmcnt(0)
88 ; SI-NEXT: s_mov_b32 s6, 0x40a00000
89 ; SI-NEXT: s_mov_b32 s3, 0x100f000
90 ; SI-NEXT: s_mov_b32 s2, -1
91 ; SI-NEXT: v_mov_b32_e32 v0, s4
92 ; SI-NEXT: v_mov_b32_e32 v1, s5
93 ; SI-NEXT: v_mov_b32_e32 v2, s6
94 ; SI-NEXT: v_mov_b32_e32 v3, s7
95 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
98 ; VI-LABEL: insertelement_v4f32_2:
100 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
101 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
102 ; VI-NEXT: s_waitcnt lgkmcnt(0)
103 ; VI-NEXT: s_mov_b32 s6, 0x40a00000
104 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
105 ; VI-NEXT: s_mov_b32 s2, -1
106 ; VI-NEXT: v_mov_b32_e32 v0, s4
107 ; VI-NEXT: v_mov_b32_e32 v1, s5
108 ; VI-NEXT: v_mov_b32_e32 v2, s6
109 ; VI-NEXT: v_mov_b32_e32 v3, s7
110 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
112 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
113 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
117 define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
118 ; SI-LABEL: insertelement_v4f32_3:
120 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
121 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
122 ; SI-NEXT: s_waitcnt lgkmcnt(0)
123 ; SI-NEXT: s_mov_b32 s7, 0x40a00000
124 ; SI-NEXT: s_mov_b32 s3, 0x100f000
125 ; SI-NEXT: s_mov_b32 s2, -1
126 ; SI-NEXT: v_mov_b32_e32 v0, s4
127 ; SI-NEXT: v_mov_b32_e32 v1, s5
128 ; SI-NEXT: v_mov_b32_e32 v2, s6
129 ; SI-NEXT: v_mov_b32_e32 v3, s7
130 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
133 ; VI-LABEL: insertelement_v4f32_3:
135 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
136 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
137 ; VI-NEXT: s_waitcnt lgkmcnt(0)
138 ; VI-NEXT: s_mov_b32 s7, 0x40a00000
139 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
140 ; VI-NEXT: s_mov_b32 s2, -1
141 ; VI-NEXT: v_mov_b32_e32 v0, s4
142 ; VI-NEXT: v_mov_b32_e32 v1, s5
143 ; VI-NEXT: v_mov_b32_e32 v2, s6
144 ; VI-NEXT: v_mov_b32_e32 v3, s7
145 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
147 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
148 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
152 define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
153 ; SI-LABEL: insertelement_v4i32_0:
155 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
156 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
157 ; SI-NEXT: s_waitcnt lgkmcnt(0)
158 ; SI-NEXT: s_movk_i32 s4, 0x3e7
159 ; SI-NEXT: v_mov_b32_e32 v0, s4
160 ; SI-NEXT: s_mov_b32 s3, 0x100f000
161 ; SI-NEXT: s_mov_b32 s2, -1
162 ; SI-NEXT: v_mov_b32_e32 v1, s5
163 ; SI-NEXT: v_mov_b32_e32 v2, s6
164 ; SI-NEXT: v_mov_b32_e32 v3, s7
165 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
168 ; VI-LABEL: insertelement_v4i32_0:
170 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
171 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
172 ; VI-NEXT: s_waitcnt lgkmcnt(0)
173 ; VI-NEXT: s_movk_i32 s4, 0x3e7
174 ; VI-NEXT: v_mov_b32_e32 v0, s4
175 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
176 ; VI-NEXT: s_mov_b32 s2, -1
177 ; VI-NEXT: v_mov_b32_e32 v1, s5
178 ; VI-NEXT: v_mov_b32_e32 v2, s6
179 ; VI-NEXT: v_mov_b32_e32 v3, s7
180 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
182 %vecins = insertelement <4 x i32> %a, i32 999, i32 0
183 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
187 define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
188 ; SI-LABEL: insertelement_v3f32_1:
190 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
191 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
192 ; SI-NEXT: s_mov_b32 s3, 0x100f000
193 ; SI-NEXT: s_mov_b32 s2, -1
194 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
195 ; SI-NEXT: s_waitcnt lgkmcnt(0)
196 ; SI-NEXT: v_mov_b32_e32 v0, s4
197 ; SI-NEXT: v_mov_b32_e32 v2, s6
198 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
201 ; VI-LABEL: insertelement_v3f32_1:
203 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
204 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
205 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
206 ; VI-NEXT: s_mov_b32 s2, -1
207 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
208 ; VI-NEXT: s_waitcnt lgkmcnt(0)
209 ; VI-NEXT: v_mov_b32_e32 v0, s4
210 ; VI-NEXT: v_mov_b32_e32 v2, s6
211 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
213 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
214 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
218 define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
219 ; SI-LABEL: insertelement_v3f32_2:
221 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
222 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
223 ; SI-NEXT: s_mov_b32 s3, 0x100f000
224 ; SI-NEXT: s_mov_b32 s2, -1
225 ; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000
226 ; SI-NEXT: s_waitcnt lgkmcnt(0)
227 ; SI-NEXT: v_mov_b32_e32 v0, s4
228 ; SI-NEXT: v_mov_b32_e32 v1, s5
229 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
232 ; VI-LABEL: insertelement_v3f32_2:
234 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
235 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
236 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
237 ; VI-NEXT: s_mov_b32 s2, -1
238 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000
239 ; VI-NEXT: s_waitcnt lgkmcnt(0)
240 ; VI-NEXT: v_mov_b32_e32 v0, s4
241 ; VI-NEXT: v_mov_b32_e32 v1, s5
242 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
244 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
245 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
249 define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
250 ; GCN-LABEL: insertelement_v3f32_3:
253 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
254 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
258 define <4 x float> @insertelement_to_sgpr() nounwind {
259 ; GCN-LABEL: insertelement_to_sgpr:
261 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
263 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
264 ; GCN-NEXT: s_mov_b32 s12, 0
265 ; GCN-NEXT: s_mov_b32 s4, s12
266 ; GCN-NEXT: s_mov_b32 s5, s12
267 ; GCN-NEXT: s_mov_b32 s6, s12
268 ; GCN-NEXT: s_mov_b32 s7, s12
269 ; GCN-NEXT: s_mov_b32 s8, s12
270 ; GCN-NEXT: s_mov_b32 s9, s12
271 ; GCN-NEXT: s_mov_b32 s10, s12
272 ; GCN-NEXT: s_mov_b32 s11, s12
273 ; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
274 ; GCN-NEXT: s_waitcnt vmcnt(0)
275 ; GCN-NEXT: s_setpc_b64 s[30:31]
276 %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
277 %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
278 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
279 ret <4 x float> %tmp2
282 define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
283 ; SI-LABEL: dynamic_insertelement_v2f32:
285 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
286 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
287 ; SI-NEXT: s_load_dword s4, s[4:5], 0x4
288 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
289 ; SI-NEXT: s_mov_b32 s3, 0x100f000
290 ; SI-NEXT: s_mov_b32 s2, -1
291 ; SI-NEXT: s_waitcnt lgkmcnt(0)
292 ; SI-NEXT: v_mov_b32_e32 v1, s7
293 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
294 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
295 ; SI-NEXT: v_mov_b32_e32 v2, s6
296 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
297 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
298 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
301 ; VI-LABEL: dynamic_insertelement_v2f32:
303 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
304 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
305 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
306 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
307 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
308 ; VI-NEXT: s_mov_b32 s2, -1
309 ; VI-NEXT: s_waitcnt lgkmcnt(0)
310 ; VI-NEXT: v_mov_b32_e32 v1, s7
311 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
312 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
313 ; VI-NEXT: v_mov_b32_e32 v2, s6
314 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
315 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
316 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
318 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
319 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
323 define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
324 ; SI-LABEL: dynamic_insertelement_v3f32:
326 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
327 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
328 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8
329 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
330 ; SI-NEXT: s_mov_b32 s3, 0x100f000
331 ; SI-NEXT: s_mov_b32 s2, -1
332 ; SI-NEXT: s_waitcnt lgkmcnt(0)
333 ; SI-NEXT: v_mov_b32_e32 v1, s10
334 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
335 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
336 ; SI-NEXT: v_mov_b32_e32 v1, s9
337 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
338 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
339 ; SI-NEXT: v_mov_b32_e32 v3, s8
340 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
341 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
342 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
345 ; VI-LABEL: dynamic_insertelement_v3f32:
347 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
348 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
349 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20
350 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
351 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
352 ; VI-NEXT: s_mov_b32 s2, -1
353 ; VI-NEXT: s_waitcnt lgkmcnt(0)
354 ; VI-NEXT: v_mov_b32_e32 v1, s10
355 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
356 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
357 ; VI-NEXT: v_mov_b32_e32 v1, s9
358 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
359 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
360 ; VI-NEXT: v_mov_b32_e32 v3, s8
361 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
362 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
363 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
365 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
366 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
370 define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
371 ; SI-LABEL: dynamic_insertelement_v4f32:
373 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
374 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
375 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8
376 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
377 ; SI-NEXT: s_mov_b32 s3, 0x100f000
378 ; SI-NEXT: s_mov_b32 s2, -1
379 ; SI-NEXT: s_waitcnt lgkmcnt(0)
380 ; SI-NEXT: v_mov_b32_e32 v1, s11
381 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
382 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
383 ; SI-NEXT: v_mov_b32_e32 v1, s10
384 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
385 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
386 ; SI-NEXT: v_mov_b32_e32 v1, s9
387 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
388 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
389 ; SI-NEXT: v_mov_b32_e32 v4, s8
390 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
391 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
392 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
395 ; VI-LABEL: dynamic_insertelement_v4f32:
397 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
398 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
399 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20
400 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
401 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
402 ; VI-NEXT: s_mov_b32 s2, -1
403 ; VI-NEXT: s_waitcnt lgkmcnt(0)
404 ; VI-NEXT: v_mov_b32_e32 v1, s11
405 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
406 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
407 ; VI-NEXT: v_mov_b32_e32 v1, s10
408 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
409 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
410 ; VI-NEXT: v_mov_b32_e32 v1, s9
411 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
412 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
413 ; VI-NEXT: v_mov_b32_e32 v4, s8
414 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
415 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
416 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
418 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
419 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
423 define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
424 ; SI-LABEL: dynamic_insertelement_v8f32:
426 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
427 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
428 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10
429 ; SI-NEXT: v_mov_b32_e32 v4, 0x40a00000
430 ; SI-NEXT: s_mov_b32 s3, 0x100f000
431 ; SI-NEXT: s_mov_b32 s2, -1
432 ; SI-NEXT: s_waitcnt lgkmcnt(0)
433 ; SI-NEXT: v_mov_b32_e32 v0, s11
434 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
435 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc
436 ; SI-NEXT: v_mov_b32_e32 v0, s10
437 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
438 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
439 ; SI-NEXT: v_mov_b32_e32 v0, s9
440 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
441 ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
442 ; SI-NEXT: v_mov_b32_e32 v0, s8
443 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
444 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
445 ; SI-NEXT: v_mov_b32_e32 v5, s15
446 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
447 ; SI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc
448 ; SI-NEXT: v_mov_b32_e32 v5, s14
449 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
450 ; SI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
451 ; SI-NEXT: v_mov_b32_e32 v5, s13
452 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
453 ; SI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc
454 ; SI-NEXT: v_mov_b32_e32 v8, s12
455 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
456 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
457 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
458 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
461 ; VI-LABEL: dynamic_insertelement_v8f32:
463 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
464 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
465 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40
466 ; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000
467 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
468 ; VI-NEXT: s_mov_b32 s2, -1
469 ; VI-NEXT: s_waitcnt lgkmcnt(0)
470 ; VI-NEXT: v_mov_b32_e32 v0, s11
471 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
472 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc
473 ; VI-NEXT: v_mov_b32_e32 v0, s10
474 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
475 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
476 ; VI-NEXT: v_mov_b32_e32 v0, s9
477 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
478 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
479 ; VI-NEXT: v_mov_b32_e32 v0, s8
480 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
481 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
482 ; VI-NEXT: v_mov_b32_e32 v5, s15
483 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
484 ; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc
485 ; VI-NEXT: v_mov_b32_e32 v5, s14
486 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
487 ; VI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
488 ; VI-NEXT: v_mov_b32_e32 v5, s13
489 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
490 ; VI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc
491 ; VI-NEXT: v_mov_b32_e32 v8, s12
492 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
493 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
494 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
495 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
497 %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
498 store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
502 define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
503 ; SI-LABEL: dynamic_insertelement_v16f32:
505 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
506 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
507 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
508 ; SI-NEXT: s_mov_b32 s3, 0x100f000
509 ; SI-NEXT: s_mov_b32 s2, -1
510 ; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000
511 ; SI-NEXT: s_waitcnt lgkmcnt(0)
512 ; SI-NEXT: v_mov_b32_e32 v0, s8
513 ; SI-NEXT: v_mov_b32_e32 v1, s9
514 ; SI-NEXT: v_mov_b32_e32 v2, s10
515 ; SI-NEXT: v_mov_b32_e32 v3, s11
516 ; SI-NEXT: v_mov_b32_e32 v4, s12
517 ; SI-NEXT: v_mov_b32_e32 v5, s13
518 ; SI-NEXT: v_mov_b32_e32 v6, s14
519 ; SI-NEXT: v_mov_b32_e32 v7, s15
520 ; SI-NEXT: v_mov_b32_e32 v8, s16
521 ; SI-NEXT: v_mov_b32_e32 v9, s17
522 ; SI-NEXT: v_mov_b32_e32 v10, s18
523 ; SI-NEXT: v_mov_b32_e32 v11, s19
524 ; SI-NEXT: v_mov_b32_e32 v12, s20
525 ; SI-NEXT: v_mov_b32_e32 v13, s21
526 ; SI-NEXT: v_mov_b32_e32 v14, s22
527 ; SI-NEXT: v_mov_b32_e32 v15, s23
528 ; SI-NEXT: s_mov_b32 m0, s4
529 ; SI-NEXT: v_movreld_b32_e32 v0, v16
530 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
531 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
532 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
533 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
536 ; VI-LABEL: dynamic_insertelement_v16f32:
538 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
539 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
540 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
541 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
542 ; VI-NEXT: s_mov_b32 s2, -1
543 ; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000
544 ; VI-NEXT: s_waitcnt lgkmcnt(0)
545 ; VI-NEXT: v_mov_b32_e32 v0, s8
546 ; VI-NEXT: v_mov_b32_e32 v1, s9
547 ; VI-NEXT: v_mov_b32_e32 v2, s10
548 ; VI-NEXT: v_mov_b32_e32 v3, s11
549 ; VI-NEXT: v_mov_b32_e32 v4, s12
550 ; VI-NEXT: v_mov_b32_e32 v5, s13
551 ; VI-NEXT: v_mov_b32_e32 v6, s14
552 ; VI-NEXT: v_mov_b32_e32 v7, s15
553 ; VI-NEXT: v_mov_b32_e32 v8, s16
554 ; VI-NEXT: v_mov_b32_e32 v9, s17
555 ; VI-NEXT: v_mov_b32_e32 v10, s18
556 ; VI-NEXT: v_mov_b32_e32 v11, s19
557 ; VI-NEXT: v_mov_b32_e32 v12, s20
558 ; VI-NEXT: v_mov_b32_e32 v13, s21
559 ; VI-NEXT: v_mov_b32_e32 v14, s22
560 ; VI-NEXT: v_mov_b32_e32 v15, s23
561 ; VI-NEXT: s_mov_b32 m0, s4
562 ; VI-NEXT: v_movreld_b32_e32 v0, v16
563 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
564 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
565 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
566 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
568 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
569 store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
573 define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
574 ; SI-LABEL: dynamic_insertelement_v2i32:
576 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
577 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
578 ; SI-NEXT: s_load_dword s4, s[4:5], 0x4
579 ; SI-NEXT: s_mov_b32 s3, 0x100f000
580 ; SI-NEXT: s_mov_b32 s2, -1
581 ; SI-NEXT: s_waitcnt lgkmcnt(0)
582 ; SI-NEXT: v_mov_b32_e32 v0, s7
583 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
584 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
585 ; SI-NEXT: v_mov_b32_e32 v0, s6
586 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
587 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
588 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
591 ; VI-LABEL: dynamic_insertelement_v2i32:
593 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
594 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
595 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
596 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
597 ; VI-NEXT: s_mov_b32 s2, -1
598 ; VI-NEXT: s_waitcnt lgkmcnt(0)
599 ; VI-NEXT: v_mov_b32_e32 v0, s7
600 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
601 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
602 ; VI-NEXT: v_mov_b32_e32 v0, s6
603 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
604 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
605 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
607 %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
608 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
612 define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
613 ; SI-LABEL: dynamic_insertelement_v3i32:
615 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
616 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
617 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8
618 ; SI-NEXT: s_mov_b32 s3, 0x100f000
619 ; SI-NEXT: s_mov_b32 s2, -1
620 ; SI-NEXT: s_waitcnt lgkmcnt(0)
621 ; SI-NEXT: v_mov_b32_e32 v0, s10
622 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
623 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc
624 ; SI-NEXT: v_mov_b32_e32 v0, s9
625 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
626 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
627 ; SI-NEXT: v_mov_b32_e32 v0, s8
628 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
629 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
630 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
633 ; VI-LABEL: dynamic_insertelement_v3i32:
635 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
636 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
637 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20
638 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
639 ; VI-NEXT: s_mov_b32 s2, -1
640 ; VI-NEXT: s_waitcnt lgkmcnt(0)
641 ; VI-NEXT: v_mov_b32_e32 v0, s10
642 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
643 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc
644 ; VI-NEXT: v_mov_b32_e32 v0, s9
645 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
646 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
647 ; VI-NEXT: v_mov_b32_e32 v0, s8
648 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
649 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
650 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
652 %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
653 store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
657 define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
658 ; SI-LABEL: dynamic_insertelement_v4i32:
660 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
661 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
662 ; SI-NEXT: s_load_dword s6, s[4:5], 0x8
663 ; SI-NEXT: s_load_dword s4, s[4:5], 0x11
664 ; SI-NEXT: s_mov_b32 s3, 0x100f000
665 ; SI-NEXT: s_mov_b32 s2, -1
666 ; SI-NEXT: s_waitcnt lgkmcnt(0)
667 ; SI-NEXT: v_mov_b32_e32 v0, s11
668 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3
669 ; SI-NEXT: v_mov_b32_e32 v4, s4
670 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
671 ; SI-NEXT: v_mov_b32_e32 v0, s10
672 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2
673 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
674 ; SI-NEXT: v_mov_b32_e32 v0, s9
675 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1
676 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
677 ; SI-NEXT: v_mov_b32_e32 v0, s8
678 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0
679 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
680 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
683 ; VI-LABEL: dynamic_insertelement_v4i32:
685 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
686 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
687 ; VI-NEXT: s_load_dword s6, s[4:5], 0x20
688 ; VI-NEXT: s_load_dword s4, s[4:5], 0x44
689 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
690 ; VI-NEXT: s_mov_b32 s2, -1
691 ; VI-NEXT: s_waitcnt lgkmcnt(0)
692 ; VI-NEXT: v_mov_b32_e32 v0, s11
693 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3
694 ; VI-NEXT: v_mov_b32_e32 v4, s4
695 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
696 ; VI-NEXT: v_mov_b32_e32 v0, s10
697 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2
698 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
699 ; VI-NEXT: v_mov_b32_e32 v0, s9
700 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1
701 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
702 ; VI-NEXT: v_mov_b32_e32 v0, s8
703 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0
704 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
705 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
707 %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
708 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
712 define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
713 ; SI-LABEL: dynamic_insertelement_v8i32:
715 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
716 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
717 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10
718 ; SI-NEXT: s_mov_b32 s3, 0x100f000
719 ; SI-NEXT: s_mov_b32 s2, -1
720 ; SI-NEXT: s_waitcnt lgkmcnt(0)
721 ; SI-NEXT: v_mov_b32_e32 v0, s11
722 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
723 ; SI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc
724 ; SI-NEXT: v_mov_b32_e32 v0, s10
725 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
726 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc
727 ; SI-NEXT: v_mov_b32_e32 v0, s9
728 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
729 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
730 ; SI-NEXT: v_mov_b32_e32 v0, s8
731 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
732 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
733 ; SI-NEXT: v_mov_b32_e32 v4, s15
734 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
735 ; SI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc
736 ; SI-NEXT: v_mov_b32_e32 v4, s14
737 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
738 ; SI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc
739 ; SI-NEXT: v_mov_b32_e32 v4, s13
740 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
741 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc
742 ; SI-NEXT: v_mov_b32_e32 v4, s12
743 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
744 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
745 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
746 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
749 ; VI-LABEL: dynamic_insertelement_v8i32:
751 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
752 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
753 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40
754 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
755 ; VI-NEXT: s_mov_b32 s2, -1
756 ; VI-NEXT: s_waitcnt lgkmcnt(0)
757 ; VI-NEXT: v_mov_b32_e32 v0, s11
758 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
759 ; VI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc
760 ; VI-NEXT: v_mov_b32_e32 v0, s10
761 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
762 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc
763 ; VI-NEXT: v_mov_b32_e32 v0, s9
764 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
765 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
766 ; VI-NEXT: v_mov_b32_e32 v0, s8
767 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
768 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
769 ; VI-NEXT: v_mov_b32_e32 v4, s15
770 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
771 ; VI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc
772 ; VI-NEXT: v_mov_b32_e32 v4, s14
773 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
774 ; VI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc
775 ; VI-NEXT: v_mov_b32_e32 v4, s13
776 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
777 ; VI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc
778 ; VI-NEXT: v_mov_b32_e32 v4, s12
779 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
780 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
781 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
782 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
784 %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
785 store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
; Dynamic (variable-index) insertelement into <16 x i32>: both targets lower to
; indirect register addressing (s_mov_b32 m0 + v_movreld_b32) and write the
; result back with four dwordx4 stores.
789 define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
790 ; SI-LABEL: dynamic_insertelement_v16i32:
792 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
793 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
794 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
795 ; SI-NEXT: s_mov_b32 s3, 0x100f000
796 ; SI-NEXT: s_mov_b32 s2, -1
797 ; SI-NEXT: s_waitcnt lgkmcnt(0)
798 ; SI-NEXT: v_mov_b32_e32 v0, s8
799 ; SI-NEXT: v_mov_b32_e32 v1, s9
800 ; SI-NEXT: v_mov_b32_e32 v2, s10
801 ; SI-NEXT: v_mov_b32_e32 v3, s11
802 ; SI-NEXT: v_mov_b32_e32 v4, s12
803 ; SI-NEXT: v_mov_b32_e32 v5, s13
804 ; SI-NEXT: v_mov_b32_e32 v6, s14
805 ; SI-NEXT: v_mov_b32_e32 v7, s15
806 ; SI-NEXT: v_mov_b32_e32 v8, s16
807 ; SI-NEXT: v_mov_b32_e32 v9, s17
808 ; SI-NEXT: v_mov_b32_e32 v10, s18
809 ; SI-NEXT: v_mov_b32_e32 v11, s19
810 ; SI-NEXT: v_mov_b32_e32 v12, s20
811 ; SI-NEXT: v_mov_b32_e32 v13, s21
812 ; SI-NEXT: v_mov_b32_e32 v14, s22
813 ; SI-NEXT: v_mov_b32_e32 v15, s23
814 ; SI-NEXT: s_mov_b32 m0, s4
815 ; SI-NEXT: v_movreld_b32_e32 v0, 5
816 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
817 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
818 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
819 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
822 ; VI-LABEL: dynamic_insertelement_v16i32:
824 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
825 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
826 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
827 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
828 ; VI-NEXT: s_mov_b32 s2, -1
829 ; VI-NEXT: s_waitcnt lgkmcnt(0)
830 ; VI-NEXT: v_mov_b32_e32 v0, s8
831 ; VI-NEXT: v_mov_b32_e32 v1, s9
832 ; VI-NEXT: v_mov_b32_e32 v2, s10
833 ; VI-NEXT: v_mov_b32_e32 v3, s11
834 ; VI-NEXT: v_mov_b32_e32 v4, s12
835 ; VI-NEXT: v_mov_b32_e32 v5, s13
836 ; VI-NEXT: v_mov_b32_e32 v6, s14
837 ; VI-NEXT: v_mov_b32_e32 v7, s15
838 ; VI-NEXT: v_mov_b32_e32 v8, s16
839 ; VI-NEXT: v_mov_b32_e32 v9, s17
840 ; VI-NEXT: v_mov_b32_e32 v10, s18
841 ; VI-NEXT: v_mov_b32_e32 v11, s19
842 ; VI-NEXT: v_mov_b32_e32 v12, s20
843 ; VI-NEXT: v_mov_b32_e32 v13, s21
844 ; VI-NEXT: v_mov_b32_e32 v14, s22
845 ; VI-NEXT: v_mov_b32_e32 v15, s23
846 ; VI-NEXT: s_mov_b32 m0, s4
847 ; VI-NEXT: v_movreld_b32_e32 v0, 5
848 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
849 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
850 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
851 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
853 %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
854 store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
; Dynamic insertelement of i16 5 into <2 x i16>: a 0xffff mask shifted by
; 16*%b selects the lane, and v_bfi_b32 merges the splat constant 0x50005
; into the packed dword, which is stored with a single buffer_store_dword.
858 define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
859 ; SI-LABEL: dynamic_insertelement_v2i16:
861 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
862 ; SI-NEXT: s_load_dword s6, s[4:5], 0x2
863 ; SI-NEXT: s_load_dword s4, s[4:5], 0x3
864 ; SI-NEXT: v_mov_b32_e32 v0, 0x50005
865 ; SI-NEXT: s_mov_b32 s3, 0x100f000
866 ; SI-NEXT: s_mov_b32 s2, -1
867 ; SI-NEXT: s_waitcnt lgkmcnt(0)
868 ; SI-NEXT: v_mov_b32_e32 v1, s6
869 ; SI-NEXT: s_lshl_b32 s4, s4, 4
870 ; SI-NEXT: s_lshl_b32 s4, 0xffff, s4
871 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
872 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
875 ; VI-LABEL: dynamic_insertelement_v2i16:
877 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
878 ; VI-NEXT: s_load_dword s6, s[4:5], 0x8
879 ; VI-NEXT: s_load_dword s4, s[4:5], 0xc
880 ; VI-NEXT: v_mov_b32_e32 v0, 0x50005
881 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
882 ; VI-NEXT: s_mov_b32 s2, -1
883 ; VI-NEXT: s_waitcnt lgkmcnt(0)
884 ; VI-NEXT: v_mov_b32_e32 v1, s6
885 ; VI-NEXT: s_lshl_b32 s4, s4, 4
886 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4
887 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
888 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
890 %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
891 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
; Dynamic insertelement into <3 x i16> (48 bits across a 64-bit pair):
; SI builds the lane mask with a 64-bit shift (s_lshl_b64) and merges with
; scalar s_andn2_b64/s_or_b64; VI instead uses two v_bfi_b32 merges.
; The result is stored as one dword plus one short.
895 define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
896 ; SI-LABEL: dynamic_insertelement_v3i16:
898 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
899 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
900 ; SI-NEXT: s_load_dword s4, s[4:5], 0x4
901 ; SI-NEXT: s_mov_b32 s5, 0
902 ; SI-NEXT: s_mov_b32 s3, 0x100f000
903 ; SI-NEXT: s_mov_b32 s2, -1
904 ; SI-NEXT: s_waitcnt lgkmcnt(0)
905 ; SI-NEXT: s_lshl_b32 s8, s4, 4
906 ; SI-NEXT: s_mov_b32 s4, 0xffff
907 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
908 ; SI-NEXT: s_mov_b32 s8, 0x50005
909 ; SI-NEXT: s_and_b32 s9, s5, s8
910 ; SI-NEXT: s_and_b32 s8, s4, s8
911 ; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
912 ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
913 ; SI-NEXT: v_mov_b32_e32 v0, s5
914 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
915 ; SI-NEXT: v_mov_b32_e32 v0, s4
916 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
919 ; VI-LABEL: dynamic_insertelement_v3i16:
921 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
922 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
923 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
924 ; VI-NEXT: s_mov_b32 s5, 0
925 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
926 ; VI-NEXT: s_mov_b32 s2, -1
927 ; VI-NEXT: s_waitcnt lgkmcnt(0)
928 ; VI-NEXT: v_mov_b32_e32 v1, s7
929 ; VI-NEXT: s_lshl_b32 s8, s4, 4
930 ; VI-NEXT: s_mov_b32 s4, 0xffff
931 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
932 ; VI-NEXT: s_mov_b32 s8, 0x50005
933 ; VI-NEXT: v_mov_b32_e32 v0, s8
934 ; VI-NEXT: v_bfi_b32 v0, s5, v0, v1
935 ; VI-NEXT: v_mov_b32_e32 v1, s8
936 ; VI-NEXT: v_mov_b32_e32 v2, s6
937 ; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
938 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
939 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
941 %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
942 store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
; Dynamic insertelement of i8 5 into <2 x i8>: a byte-lane mask (-1 shifted
; by 8*%b) selects the lane. SI merges 0x505 with v_bfi_b32; VI builds the
; mask with a 16-bit shift and merges manually via and/xor/and/or.
; The [8 x i32] padding args force non-adjacent kernarg offsets.
946 define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
947 ; SI-LABEL: dynamic_insertelement_v2i8:
949 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
950 ; SI-NEXT: s_load_dword s6, s[4:5], 0xa
951 ; SI-NEXT: s_load_dword s4, s[4:5], 0x13
952 ; SI-NEXT: v_mov_b32_e32 v0, 0x505
953 ; SI-NEXT: s_mov_b32 s3, 0x100f000
954 ; SI-NEXT: s_mov_b32 s2, -1
955 ; SI-NEXT: s_waitcnt lgkmcnt(0)
956 ; SI-NEXT: v_mov_b32_e32 v1, s6
957 ; SI-NEXT: s_lshl_b32 s4, s4, 3
958 ; SI-NEXT: s_lshl_b32 s4, -1, s4
959 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
960 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
963 ; VI-LABEL: dynamic_insertelement_v2i8:
965 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
966 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28
967 ; VI-NEXT: s_load_dword s4, s[4:5], 0x4c
968 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
969 ; VI-NEXT: s_mov_b32 s2, -1
970 ; VI-NEXT: s_waitcnt lgkmcnt(0)
971 ; VI-NEXT: s_lshl_b32 s4, s4, 3
972 ; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1
973 ; VI-NEXT: v_and_b32_e32 v1, 0x505, v0
974 ; VI-NEXT: v_xor_b32_e32 v0, -1, v0
975 ; VI-NEXT: v_and_b32_e32 v0, s6, v0
976 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
977 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
979 %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
980 store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
984 ; FIXME: post legalize i16 and i32 shifts aren't merged because of
985 ; isTypeDesirableForOp in SimplifyDemandedBits
; Dynamic insertelement into <3 x i8>: v_bfi_b32 merges the splat 0x5050505
; under a shifted 0xffff mask; the 24-bit result is stored as a short plus
; a byte (offset:2).
986 define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
987 ; SI-LABEL: dynamic_insertelement_v3i8:
989 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
990 ; SI-NEXT: s_load_dword s6, s[4:5], 0xa
991 ; SI-NEXT: s_load_dword s4, s[4:5], 0x13
992 ; SI-NEXT: v_mov_b32_e32 v0, 0x5050505
993 ; SI-NEXT: s_mov_b32 s3, 0x100f000
994 ; SI-NEXT: s_mov_b32 s2, -1
995 ; SI-NEXT: s_waitcnt lgkmcnt(0)
996 ; SI-NEXT: v_mov_b32_e32 v1, s6
997 ; SI-NEXT: s_lshl_b32 s4, s4, 3
998 ; SI-NEXT: s_lshl_b32 s4, 0xffff, s4
999 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
1000 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1001 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1002 ; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
1005 ; VI-LABEL: dynamic_insertelement_v3i8:
1007 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1008 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28
1009 ; VI-NEXT: s_load_dword s4, s[4:5], 0x4c
1010 ; VI-NEXT: v_mov_b32_e32 v0, 0x5050505
1011 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1012 ; VI-NEXT: s_mov_b32 s2, -1
1013 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1014 ; VI-NEXT: v_mov_b32_e32 v1, s6
1015 ; VI-NEXT: s_lshl_b32 s4, s4, 3
1016 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4
1017 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
1018 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1019 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
1020 ; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
1022 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
1023 store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
; Dynamic insertelement into <4 x i8>: a single v_bfi_b32 merges the splat
; 0x5050505 under a shifted 0xffff mask and the packed vector is stored with
; one buffer_store_dword.
1027 define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
1028 ; SI-LABEL: dynamic_insertelement_v4i8:
1030 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1031 ; SI-NEXT: s_load_dword s6, s[4:5], 0xa
1032 ; SI-NEXT: s_load_dword s4, s[4:5], 0x13
1033 ; SI-NEXT: v_mov_b32_e32 v0, 0x5050505
1034 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1035 ; SI-NEXT: s_mov_b32 s2, -1
1036 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1037 ; SI-NEXT: v_mov_b32_e32 v1, s6
1038 ; SI-NEXT: s_lshl_b32 s4, s4, 3
1039 ; SI-NEXT: s_lshl_b32 s4, 0xffff, s4
1040 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
1041 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1044 ; VI-LABEL: dynamic_insertelement_v4i8:
1046 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1047 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28
1048 ; VI-NEXT: s_load_dword s4, s[4:5], 0x4c
1049 ; VI-NEXT: v_mov_b32_e32 v0, 0x5050505
1050 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1051 ; VI-NEXT: s_mov_b32 s2, -1
1052 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1053 ; VI-NEXT: v_mov_b32_e32 v1, s6
1054 ; VI-NEXT: s_lshl_b32 s4, s4, 3
1055 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4
1056 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
1057 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1059 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
1060 store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
; Dynamic insertelement into an <8 x i8> loaded from a constant-address-space
; (addrspace 4) pointer: the vector stays in SGPRs, so the lane mask is built
; with a 64-bit shift (s_lshl_b64) and merged entirely with scalar
; s_andn2_b64/s_or_b64 before a single dwordx2 store.
1064 define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
1065 ; SI-LABEL: s_dynamic_insertelement_v8i8:
1067 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
1068 ; SI-NEXT: s_load_dword s6, s[4:5], 0x4
1069 ; SI-NEXT: s_mov_b32 s7, 0
1070 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1071 ; SI-NEXT: s_mov_b32 s2, -1
1072 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1073 ; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
1074 ; SI-NEXT: s_mov_b32 s0, s8
1075 ; SI-NEXT: s_lshl_b32 s8, s6, 3
1076 ; SI-NEXT: s_mov_b32 s6, 0xffff
1077 ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
1078 ; SI-NEXT: s_mov_b32 s8, 0x5050505
1079 ; SI-NEXT: s_mov_b32 s1, s9
1080 ; SI-NEXT: s_and_b32 s9, s7, s8
1081 ; SI-NEXT: s_and_b32 s8, s6, s8
1082 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1083 ; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
1084 ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
1085 ; SI-NEXT: v_mov_b32_e32 v0, s4
1086 ; SI-NEXT: v_mov_b32_e32 v1, s5
1087 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1090 ; VI-LABEL: s_dynamic_insertelement_v8i8:
1092 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
1093 ; VI-NEXT: s_load_dword s6, s[4:5], 0x10
1094 ; VI-NEXT: s_mov_b32 s7, 0
1095 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1096 ; VI-NEXT: s_mov_b32 s2, -1
1097 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1098 ; VI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
1099 ; VI-NEXT: s_mov_b32 s0, s8
1100 ; VI-NEXT: s_lshl_b32 s8, s6, 3
1101 ; VI-NEXT: s_mov_b32 s6, 0xffff
1102 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
1103 ; VI-NEXT: s_mov_b32 s8, 0x5050505
1104 ; VI-NEXT: s_mov_b32 s1, s9
1105 ; VI-NEXT: s_and_b32 s9, s7, s8
1106 ; VI-NEXT: s_and_b32 s8, s6, s8
1107 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1108 ; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
1109 ; VI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
1110 ; VI-NEXT: v_mov_b32_e32 v0, s4
1111 ; VI-NEXT: v_mov_b32_e32 v1, s5
1112 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1114 %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
1115 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
1116 store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
; Dynamic insertelement into <16 x i8>: fully scalarized lowering — one
; v_cmp_ne_u32/v_cndmask pair per byte lane (select 5 when the index matches),
; then the bytes are repacked into four dwords (SI via shift/and/or, VI via
; sdwa or-merges) before a single dwordx4 store.
1120 define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
1121 ; SI-LABEL: dynamic_insertelement_v16i8:
1123 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1124 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
1125 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8
1126 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1127 ; SI-NEXT: s_mov_b32 s2, -1
1128 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1129 ; SI-NEXT: s_lshr_b32 s5, s11, 24
1130 ; SI-NEXT: v_mov_b32_e32 v0, s5
1131 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15
1132 ; SI-NEXT: s_lshr_b32 s5, s11, 16
1133 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1134 ; SI-NEXT: v_mov_b32_e32 v1, s5
1135 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14
1136 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1137 ; SI-NEXT: s_movk_i32 s5, 0xff
1138 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1139 ; SI-NEXT: v_and_b32_e32 v1, s5, v1
1140 ; SI-NEXT: s_lshr_b32 s6, s11, 8
1141 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1142 ; SI-NEXT: v_mov_b32_e32 v1, s6
1143 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13
1144 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1145 ; SI-NEXT: v_mov_b32_e32 v2, s11
1146 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12
1147 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1148 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1149 ; SI-NEXT: v_and_b32_e32 v2, s5, v2
1150 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
1151 ; SI-NEXT: s_mov_b32 s6, 0xffff
1152 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1153 ; SI-NEXT: v_and_b32_e32 v1, s6, v1
1154 ; SI-NEXT: s_lshr_b32 s7, s10, 24
1155 ; SI-NEXT: v_or_b32_e32 v3, v1, v0
1156 ; SI-NEXT: v_mov_b32_e32 v0, s7
1157 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11
1158 ; SI-NEXT: s_lshr_b32 s7, s10, 16
1159 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1160 ; SI-NEXT: v_mov_b32_e32 v1, s7
1161 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10
1162 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1163 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1164 ; SI-NEXT: v_and_b32_e32 v1, s5, v1
1165 ; SI-NEXT: s_lshr_b32 s7, s10, 8
1166 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1167 ; SI-NEXT: v_mov_b32_e32 v1, s7
1168 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9
1169 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1170 ; SI-NEXT: v_mov_b32_e32 v2, s10
1171 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8
1172 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1173 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1174 ; SI-NEXT: v_and_b32_e32 v2, s5, v2
1175 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
1176 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1177 ; SI-NEXT: v_and_b32_e32 v1, s6, v1
1178 ; SI-NEXT: s_lshr_b32 s7, s9, 24
1179 ; SI-NEXT: v_or_b32_e32 v2, v1, v0
1180 ; SI-NEXT: v_mov_b32_e32 v0, s7
1181 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
1182 ; SI-NEXT: s_lshr_b32 s7, s9, 16
1183 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1184 ; SI-NEXT: v_mov_b32_e32 v1, s7
1185 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
1186 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1187 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1188 ; SI-NEXT: v_and_b32_e32 v1, s5, v1
1189 ; SI-NEXT: s_lshr_b32 s7, s9, 8
1190 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1191 ; SI-NEXT: v_mov_b32_e32 v1, s7
1192 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
1193 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1194 ; SI-NEXT: v_mov_b32_e32 v4, s9
1195 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
1196 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1197 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1198 ; SI-NEXT: v_and_b32_e32 v4, s5, v4
1199 ; SI-NEXT: v_or_b32_e32 v1, v4, v1
1200 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1201 ; SI-NEXT: v_and_b32_e32 v1, s6, v1
1202 ; SI-NEXT: s_lshr_b32 s7, s8, 24
1203 ; SI-NEXT: v_or_b32_e32 v1, v1, v0
1204 ; SI-NEXT: v_mov_b32_e32 v0, s7
1205 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
1206 ; SI-NEXT: s_lshr_b32 s7, s8, 16
1207 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1208 ; SI-NEXT: v_mov_b32_e32 v4, s7
1209 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
1210 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1211 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1212 ; SI-NEXT: v_and_b32_e32 v4, s5, v4
1213 ; SI-NEXT: s_lshr_b32 s7, s8, 8
1214 ; SI-NEXT: v_or_b32_e32 v0, v4, v0
1215 ; SI-NEXT: v_mov_b32_e32 v4, s7
1216 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
1217 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1218 ; SI-NEXT: v_mov_b32_e32 v5, s8
1219 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
1220 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc
1221 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
1222 ; SI-NEXT: v_and_b32_e32 v5, s5, v5
1223 ; SI-NEXT: v_or_b32_e32 v4, v5, v4
1224 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1225 ; SI-NEXT: v_and_b32_e32 v4, s6, v4
1226 ; SI-NEXT: v_or_b32_e32 v0, v4, v0
1227 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1230 ; VI-LABEL: dynamic_insertelement_v16i8:
1232 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1233 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
1234 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20
1235 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1236 ; VI-NEXT: s_mov_b32 s2, -1
1237 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1238 ; VI-NEXT: s_lshr_b32 s5, s11, 24
1239 ; VI-NEXT: v_mov_b32_e32 v0, s5
1240 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15
1241 ; VI-NEXT: s_lshr_b32 s5, s11, 16
1242 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1243 ; VI-NEXT: v_mov_b32_e32 v1, s5
1244 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14
1245 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1246 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1247 ; VI-NEXT: s_lshr_b32 s5, s11, 8
1248 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1249 ; VI-NEXT: v_mov_b32_e32 v1, s5
1250 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13
1251 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1252 ; VI-NEXT: v_mov_b32_e32 v2, s11
1253 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12
1254 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1255 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1256 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1257 ; VI-NEXT: s_lshr_b32 s5, s10, 24
1258 ; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1259 ; VI-NEXT: v_mov_b32_e32 v0, s5
1260 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11
1261 ; VI-NEXT: s_lshr_b32 s5, s10, 16
1262 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1263 ; VI-NEXT: v_mov_b32_e32 v1, s5
1264 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10
1265 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1266 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1267 ; VI-NEXT: s_lshr_b32 s5, s10, 8
1268 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1269 ; VI-NEXT: v_mov_b32_e32 v1, s5
1270 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9
1271 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1272 ; VI-NEXT: v_mov_b32_e32 v2, s10
1273 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8
1274 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1275 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1276 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1277 ; VI-NEXT: s_lshr_b32 s5, s9, 24
1278 ; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1279 ; VI-NEXT: v_mov_b32_e32 v0, s5
1280 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
1281 ; VI-NEXT: s_lshr_b32 s5, s9, 16
1282 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1283 ; VI-NEXT: v_mov_b32_e32 v1, s5
1284 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
1285 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1286 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1287 ; VI-NEXT: s_lshr_b32 s5, s9, 8
1288 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1289 ; VI-NEXT: v_mov_b32_e32 v1, s5
1290 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
1291 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1292 ; VI-NEXT: v_mov_b32_e32 v4, s9
1293 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
1294 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1295 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1296 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1297 ; VI-NEXT: s_lshr_b32 s5, s8, 24
1298 ; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1299 ; VI-NEXT: v_mov_b32_e32 v0, s5
1300 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
1301 ; VI-NEXT: s_lshr_b32 s5, s8, 16
1302 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1303 ; VI-NEXT: v_mov_b32_e32 v4, s5
1304 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
1305 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1306 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1307 ; VI-NEXT: s_lshr_b32 s5, s8, 8
1308 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1309 ; VI-NEXT: v_mov_b32_e32 v4, s5
1310 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
1311 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1312 ; VI-NEXT: v_mov_b32_e32 v5, s8
1313 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
1314 ; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
1315 ; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc
1316 ; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1317 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1318 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1320 %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
1321 store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
1325 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
1326 ; the compiler doesn't crash.
; Regression test: an insertelement whose vector value flows across basic
; blocks (built in %entry, completed in %if/%else, merged by a phi in %endif)
; produces INSERT_SUBREG that SIFixSGPRCopies must handle without crashing.
; The checks pin the scc-branch + two-load + dwordx2-store lowering.
1327 define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
1328 ; SI-LABEL: insert_split_bb:
1329 ; SI: ; %bb.0: ; %entry
1330 ; SI-NEXT: s_load_dword s0, s[4:5], 0x4
1331 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
1332 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1333 ; SI-NEXT: s_cmp_lg_u32 s0, 0
1334 ; SI-NEXT: s_cbranch_scc0 BB26_2
1335 ; SI-NEXT: ; %bb.1: ; %else
1336 ; SI-NEXT: s_load_dword s1, s[6:7], 0x1
1337 ; SI-NEXT: s_branch BB26_3
1338 ; SI-NEXT: BB26_2: ; %if
1339 ; SI-NEXT: s_load_dword s1, s[6:7], 0x0
1340 ; SI-NEXT: BB26_3: ; %endif
1341 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1342 ; SI-NEXT: v_mov_b32_e32 v0, s0
1343 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1344 ; SI-NEXT: s_mov_b32 s6, -1
1345 ; SI-NEXT: v_mov_b32_e32 v1, s1
1346 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1349 ; VI-LABEL: insert_split_bb:
1350 ; VI: ; %bb.0: ; %entry
1351 ; VI-NEXT: s_load_dword s0, s[4:5], 0x10
1352 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
1353 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1354 ; VI-NEXT: s_cmp_lg_u32 s0, 0
1355 ; VI-NEXT: s_cbranch_scc0 BB26_2
1356 ; VI-NEXT: ; %bb.1: ; %else
1357 ; VI-NEXT: s_load_dword s1, s[6:7], 0x4
1358 ; VI-NEXT: s_branch BB26_3
1359 ; VI-NEXT: BB26_2: ; %if
1360 ; VI-NEXT: s_load_dword s1, s[6:7], 0x0
1361 ; VI-NEXT: BB26_3: ; %endif
1362 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1363 ; VI-NEXT: v_mov_b32_e32 v0, s0
1364 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
1365 ; VI-NEXT: s_mov_b32 s6, -1
1366 ; VI-NEXT: v_mov_b32_e32 v1, s1
1367 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1370 %0 = insertelement <2 x i32> undef, i32 %a, i32 0
1371 %1 = icmp eq i32 %a, 0
1372 br i1 %1, label %if, label %else
1375 %2 = load i32, i32 addrspace(1)* %in
1376 %3 = insertelement <2 x i32> %0, i32 %2, i32 1
1380 %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
1381 %5 = load i32, i32 addrspace(1)* %4
1382 %6 = insertelement <2 x i32> %0, i32 %5, i32 1
1386 %7 = phi <2 x i32> [%3, %if], [%6, %else]
1387 store <2 x i32> %7, <2 x i32> addrspace(1)* %out
; Dynamic insertelement of double 8.0 into <2 x double>: each 64-bit lane is
; selected with a v_cmp_eq_u32 + two v_cndmask pairs — high dword gets
; 0x40200000 (the upper half of 8.0), low dword gets 0 — then one dwordx4 store.
1391 define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
1392 ; SI-LABEL: dynamic_insertelement_v2f64:
1394 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1395 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xc
1396 ; SI-NEXT: s_load_dword s4, s[4:5], 0x18
1397 ; SI-NEXT: v_mov_b32_e32 v1, 0x40200000
1398 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1399 ; SI-NEXT: s_mov_b32 s2, -1
1400 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1401 ; SI-NEXT: v_mov_b32_e32 v0, s11
1402 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
1403 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
1404 ; SI-NEXT: v_mov_b32_e32 v0, s10
1405 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
1406 ; SI-NEXT: v_mov_b32_e32 v0, s9
1407 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
1408 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
1409 ; SI-NEXT: v_mov_b32_e32 v0, s8
1410 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1411 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1414 ; VI-LABEL: dynamic_insertelement_v2f64:
1416 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1417 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x30
1418 ; VI-NEXT: s_load_dword s4, s[4:5], 0x60
1419 ; VI-NEXT: v_mov_b32_e32 v1, 0x40200000
1420 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1421 ; VI-NEXT: s_mov_b32 s2, -1
1422 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1423 ; VI-NEXT: v_mov_b32_e32 v0, s11
1424 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
1425 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
1426 ; VI-NEXT: v_mov_b32_e32 v0, s10
1427 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
1428 ; VI-NEXT: v_mov_b32_e32 v0, s9
1429 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
1430 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
1431 ; VI-NEXT: v_mov_b32_e32 v0, s8
1432 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1433 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1435 %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
1436 store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
; Dynamic insertelement of i64 5 into <2 x i64>: per-element v_cmp_eq_u32
; with cndmask pairs writing 5 into the low dword and 0 into the high dword
; of the matching lane; stored with one dwordx4.
1440 define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
1441 ; SI-LABEL: dynamic_insertelement_v2i64:
1443 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1444 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
1445 ; SI-NEXT: s_load_dword s6, s[4:5], 0x8
1446 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1447 ; SI-NEXT: s_mov_b32 s2, -1
1448 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1449 ; SI-NEXT: v_mov_b32_e32 v0, s11
1450 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
1451 ; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1452 ; SI-NEXT: v_mov_b32_e32 v0, s10
1453 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
1454 ; SI-NEXT: v_mov_b32_e32 v0, s9
1455 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
1456 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
1457 ; SI-NEXT: v_mov_b32_e32 v0, s8
1458 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
1459 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1462 ; VI-LABEL: dynamic_insertelement_v2i64:
1464 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1465 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
1466 ; VI-NEXT: s_load_dword s6, s[4:5], 0x20
1467 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1468 ; VI-NEXT: s_mov_b32 s2, -1
1469 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1470 ; VI-NEXT: v_mov_b32_e32 v0, s11
1471 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
1472 ; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1473 ; VI-NEXT: v_mov_b32_e32 v0, s10
1474 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
1475 ; VI-NEXT: v_mov_b32_e32 v0, s9
1476 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
1477 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
1478 ; VI-NEXT: v_mov_b32_e32 v0, s8
1479 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
1480 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1482 %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
1483 store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
; Dynamic insertelement of i64 5 into <3 x i64>: three v_cmp_eq_u32 lanes
; with cndmask pairs (5 low dword / 0 high dword); the 24-byte result is
; stored as a dwordx2 at offset 16 plus a dwordx4 at offset 0.
1487 define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
1488 ; SI-LABEL: dynamic_insertelement_v3i64:
1490 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1491 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
1492 ; SI-NEXT: s_load_dword s6, s[4:5], 0x10
1493 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1494 ; SI-NEXT: s_mov_b32 s2, -1
1495 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1496 ; SI-NEXT: v_mov_b32_e32 v0, s13
1497 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2
1498 ; SI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
1499 ; SI-NEXT: v_mov_b32_e32 v0, s12
1500 ; SI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5]
1501 ; SI-NEXT: v_mov_b32_e32 v0, s11
1502 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
1503 ; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1504 ; SI-NEXT: v_mov_b32_e32 v0, s10
1505 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
1506 ; SI-NEXT: v_mov_b32_e32 v0, s9
1507 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
1508 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
1509 ; SI-NEXT: v_mov_b32_e32 v0, s8
1510 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
1511 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
1512 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1515 ; VI-LABEL: dynamic_insertelement_v3i64:
1517 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1518 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
1519 ; VI-NEXT: s_load_dword s6, s[4:5], 0x40
1520 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1521 ; VI-NEXT: s_mov_b32 s2, -1
1522 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1523 ; VI-NEXT: v_mov_b32_e32 v0, s13
1524 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 2
1525 ; VI-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5]
1526 ; VI-NEXT: v_mov_b32_e32 v0, s12
1527 ; VI-NEXT: v_cndmask_b32_e64 v4, v0, 5, s[4:5]
1528 ; VI-NEXT: v_mov_b32_e32 v0, s11
1529 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
1530 ; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1531 ; VI-NEXT: v_mov_b32_e32 v0, s10
1532 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
1533 ; VI-NEXT: v_mov_b32_e32 v0, s9
1534 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
1535 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
1536 ; VI-NEXT: v_mov_b32_e32 v0, s8
1537 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
1538 ; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
1539 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1541 %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
1542 store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
; Dynamic insertelement of double 8.0 into <4 x double>: four v_cmp_eq_u32
; lanes, each with cndmask pairs selecting 0x40200000 (high dword of 8.0)
; and 0 (low dword); result stored with two dwordx4 stores.
1546 define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
1547 ; SI-LABEL: dynamic_insertelement_v4f64:
1549 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1550 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
1551 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10
1552 ; SI-NEXT: v_mov_b32_e32 v4, 0x40200000
1553 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1554 ; SI-NEXT: s_mov_b32 s2, -1
1555 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1556 ; SI-NEXT: v_mov_b32_e32 v0, s11
1557 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
1558 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
1559 ; SI-NEXT: v_mov_b32_e32 v0, s10
1560 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
1561 ; SI-NEXT: v_mov_b32_e32 v0, s9
1562 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
1563 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
1564 ; SI-NEXT: v_mov_b32_e32 v0, s8
1565 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1566 ; SI-NEXT: v_mov_b32_e32 v5, s15
1567 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3
1568 ; SI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
1569 ; SI-NEXT: v_mov_b32_e32 v5, s14
1570 ; SI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
1571 ; SI-NEXT: v_mov_b32_e32 v5, s13
1572 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2
1573 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
1574 ; SI-NEXT: v_mov_b32_e32 v4, s12
1575 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
1576 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1577 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1580 ; VI-LABEL: dynamic_insertelement_v4f64:
1582 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1583 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
1584 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40
1585 ; VI-NEXT: v_mov_b32_e32 v4, 0x40200000
1586 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1587 ; VI-NEXT: s_mov_b32 s2, -1
1588 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1589 ; VI-NEXT: v_mov_b32_e32 v0, s11
1590 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
1591 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
1592 ; VI-NEXT: v_mov_b32_e32 v0, s10
1593 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
1594 ; VI-NEXT: v_mov_b32_e32 v0, s9
1595 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
1596 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
1597 ; VI-NEXT: v_mov_b32_e32 v0, s8
1598 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1599 ; VI-NEXT: v_mov_b32_e32 v5, s15
1600 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3
1601 ; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
1602 ; VI-NEXT: v_mov_b32_e32 v5, s14
1603 ; VI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
1604 ; VI-NEXT: v_mov_b32_e32 v5, s13
1605 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2
1606 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
1607 ; VI-NEXT: v_mov_b32_e32 v4, s12
1608 ; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
1609 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1610 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1612 %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
1613 store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
1617 define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
1618 ; SI-LABEL: dynamic_insertelement_v8f64:
1620 ; SI-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x0
1621 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
1622 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
1623 ; SI-NEXT: v_mov_b32_e32 v16, 64
1624 ; SI-NEXT: s_mov_b32 s27, 0x100f000
1625 ; SI-NEXT: s_mov_b32 s26, -1
1626 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1627 ; SI-NEXT: v_mov_b32_e32 v0, s8
1628 ; SI-NEXT: s_and_b32 s4, s4, 7
1629 ; SI-NEXT: s_lshl_b32 s4, s4, 3
1630 ; SI-NEXT: v_mov_b32_e32 v1, s9
1631 ; SI-NEXT: v_mov_b32_e32 v2, s10
1632 ; SI-NEXT: v_mov_b32_e32 v3, s11
1633 ; SI-NEXT: v_mov_b32_e32 v4, s12
1634 ; SI-NEXT: v_mov_b32_e32 v5, s13
1635 ; SI-NEXT: v_mov_b32_e32 v6, s14
1636 ; SI-NEXT: v_mov_b32_e32 v7, s15
1637 ; SI-NEXT: v_mov_b32_e32 v8, s16
1638 ; SI-NEXT: v_mov_b32_e32 v9, s17
1639 ; SI-NEXT: v_mov_b32_e32 v10, s18
1640 ; SI-NEXT: v_mov_b32_e32 v11, s19
1641 ; SI-NEXT: v_mov_b32_e32 v12, s20
1642 ; SI-NEXT: v_mov_b32_e32 v13, s21
1643 ; SI-NEXT: v_mov_b32_e32 v14, s22
1644 ; SI-NEXT: v_mov_b32_e32 v15, s23
1645 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
1646 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
1647 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
1648 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
1649 ; SI-NEXT: v_or_b32_e32 v16, s4, v16
1650 ; SI-NEXT: v_mov_b32_e32 v0, 0
1651 ; SI-NEXT: v_mov_b32_e32 v1, 0x40200000
1652 ; SI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen
1653 ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64
1654 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80
1655 ; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96
1656 ; SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112
1657 ; SI-NEXT: s_waitcnt vmcnt(0)
1658 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[24:27], 0 offset:48
1659 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[24:27], 0 offset:32
1660 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:16
1661 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[24:27], 0
1664 ; VI-LABEL: dynamic_insertelement_v8f64:
1666 ; VI-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x0
1667 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
1668 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
1669 ; VI-NEXT: v_mov_b32_e32 v16, 64
1670 ; VI-NEXT: s_mov_b32 s27, 0x1100f000
1671 ; VI-NEXT: s_mov_b32 s26, -1
1672 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1673 ; VI-NEXT: v_mov_b32_e32 v0, s8
1674 ; VI-NEXT: s_and_b32 s4, s4, 7
1675 ; VI-NEXT: s_lshl_b32 s4, s4, 3
1676 ; VI-NEXT: v_mov_b32_e32 v1, s9
1677 ; VI-NEXT: v_mov_b32_e32 v2, s10
1678 ; VI-NEXT: v_mov_b32_e32 v3, s11
1679 ; VI-NEXT: v_mov_b32_e32 v4, s12
1680 ; VI-NEXT: v_mov_b32_e32 v5, s13
1681 ; VI-NEXT: v_mov_b32_e32 v6, s14
1682 ; VI-NEXT: v_mov_b32_e32 v7, s15
1683 ; VI-NEXT: v_mov_b32_e32 v8, s16
1684 ; VI-NEXT: v_mov_b32_e32 v9, s17
1685 ; VI-NEXT: v_mov_b32_e32 v10, s18
1686 ; VI-NEXT: v_mov_b32_e32 v11, s19
1687 ; VI-NEXT: v_mov_b32_e32 v12, s20
1688 ; VI-NEXT: v_mov_b32_e32 v13, s21
1689 ; VI-NEXT: v_mov_b32_e32 v14, s22
1690 ; VI-NEXT: v_mov_b32_e32 v15, s23
1691 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
1692 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
1693 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
1694 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
1695 ; VI-NEXT: v_or_b32_e32 v16, s4, v16
1696 ; VI-NEXT: v_mov_b32_e32 v0, 0
1697 ; VI-NEXT: v_mov_b32_e32 v1, 0x40200000
1698 ; VI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen
1699 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64
1700 ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80
1701 ; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96
1702 ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112
1703 ; VI-NEXT: s_waitcnt vmcnt(0)
1704 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[24:27], 0 offset:48
1705 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[24:27], 0 offset:32
1706 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:16
1707 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[24:27], 0
1709 %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
1710 store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
1714 declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
1716 attributes #0 = { nounwind }
1717 attributes #1 = { nounwind readnone }