1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s
4 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s
5 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
; Scalar case: loads a <2 x i16> through the addrspace(4) pointer %vec.ptr,
; replaces element 0 with the constant 999 (0x3e7) and stores the result to %out.
; The GFX9 and GFX11 checks expect a single s_pack_lh_b32_b16 with 0x3e7 as the
; first source operand; the shared CI/VI checks expect s_and_b32 with 0xffff0000
; followed by s_or_b32 with 0x3e7.
7 define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
8 ; GFX9-LABEL: s_insertelement_v2i16_0:
10 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
13 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX9-NEXT: s_pack_lh_b32_b16 s2, 0x3e7, s2
16 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
17 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
20 ; CIVI-LABEL: s_insertelement_v2i16_0:
22 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
23 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
24 ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
25 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
26 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
27 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
28 ; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000
29 ; CIVI-NEXT: s_or_b32 s0, s0, 0x3e7
30 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
31 ; CIVI-NEXT: flat_store_dword v[0:1], v2
34 ; GFX11-LABEL: s_insertelement_v2i16_0:
36 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
37 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
38 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
39 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0x3e7, s2
41 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
42 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
43 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
44 ; GFX11-NEXT: s_endpgm
45 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
46 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
47 store <2 x i16> %vecins, ptr addrspace(1) %out
; Scalar case with a variable element: inserts the i16 kernel argument %elt
; (declared after an unnamed [8 x i32] argument) into element 0 of the
; <2 x i16> loaded through %vec.ptr and stores the result to %out.
; The GFX9 and GFX11 checks expect one s_pack_lh_b32_b16 combining the loaded
; %elt register with the loaded vector; the VI and CI checks expect %elt
; masked with 0xffff, the vector masked with 0xffff0000, and an s_or_b32.
; The CI checks load %elt from offset 0xc where the other targets use 0x30.
52 define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
53 ; GFX9-LABEL: s_insertelement_v2i16_0_reg:
55 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
56 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30
57 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
58 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
59 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
60 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
61 ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s4, s2
62 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
63 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
66 ; VI-LABEL: s_insertelement_v2i16_0_reg:
68 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
69 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30
70 ; VI-NEXT: s_waitcnt lgkmcnt(0)
71 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
72 ; VI-NEXT: v_mov_b32_e32 v0, s0
73 ; VI-NEXT: v_mov_b32_e32 v1, s1
74 ; VI-NEXT: s_and_b32 s0, s4, 0xffff
75 ; VI-NEXT: s_waitcnt lgkmcnt(0)
76 ; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
77 ; VI-NEXT: s_or_b32 s0, s0, s1
78 ; VI-NEXT: v_mov_b32_e32 v2, s0
79 ; VI-NEXT: flat_store_dword v[0:1], v2
82 ; CI-LABEL: s_insertelement_v2i16_0_reg:
84 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
85 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc
86 ; CI-NEXT: s_waitcnt lgkmcnt(0)
87 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
88 ; CI-NEXT: v_mov_b32_e32 v0, s0
89 ; CI-NEXT: v_mov_b32_e32 v1, s1
90 ; CI-NEXT: s_and_b32 s1, s4, 0xffff
91 ; CI-NEXT: s_waitcnt lgkmcnt(0)
92 ; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
93 ; CI-NEXT: s_or_b32 s0, s1, s0
94 ; CI-NEXT: v_mov_b32_e32 v2, s0
95 ; CI-NEXT: flat_store_dword v[0:1], v2
98 ; GFX11-LABEL: s_insertelement_v2i16_0_reg:
100 ; GFX11-NEXT: s_clause 0x1
101 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
102 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
103 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
104 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
105 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
106 ; GFX11-NEXT: s_pack_lh_b32_b16 s2, s4, s2
107 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
108 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
109 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
110 ; GFX11-NEXT: s_endpgm
111 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
112 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
113 store <2 x i16> %vecins, ptr addrspace(1) %out
; Same insertion of %elt into element 0 as the plain register case, but
; element 1 of the loaded vector is also extracted, zero-extended to i32 and
; passed to an inline asm "; use $0" with an "s" constraint, so the high half
; of the vector has a second use.
; The GFX9 and GFX11 checks expect the high half to be materialized with
; s_lshr_b32 ..., 16 and then combined with s_pack_ll_b32_b16; the shifted
; register (s2) is the one consumed by the inline asm.
117 define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
118 ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
120 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
121 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30
122 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
123 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
124 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
125 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
126 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
127 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s2
128 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
129 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
130 ; GFX9-NEXT: ;;#ASMSTART
131 ; GFX9-NEXT: ; use s2
132 ; GFX9-NEXT: ;;#ASMEND
133 ; GFX9-NEXT: s_endpgm
135 ; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
137 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
138 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30
139 ; VI-NEXT: s_waitcnt lgkmcnt(0)
140 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
141 ; VI-NEXT: v_mov_b32_e32 v0, s0
142 ; VI-NEXT: v_mov_b32_e32 v1, s1
143 ; VI-NEXT: s_and_b32 s0, s4, 0xffff
144 ; VI-NEXT: s_waitcnt lgkmcnt(0)
145 ; VI-NEXT: s_lshr_b32 s1, s2, 16
146 ; VI-NEXT: s_and_b32 s2, s2, 0xffff0000
147 ; VI-NEXT: s_or_b32 s0, s0, s2
148 ; VI-NEXT: v_mov_b32_e32 v2, s0
149 ; VI-NEXT: flat_store_dword v[0:1], v2
150 ; VI-NEXT: ;;#ASMSTART
155 ; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
157 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
158 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc
159 ; CI-NEXT: s_waitcnt lgkmcnt(0)
160 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
161 ; CI-NEXT: v_mov_b32_e32 v0, s0
162 ; CI-NEXT: v_mov_b32_e32 v1, s1
163 ; CI-NEXT: s_and_b32 s0, s4, 0xffff
164 ; CI-NEXT: s_waitcnt lgkmcnt(0)
165 ; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
166 ; CI-NEXT: s_or_b32 s0, s0, s1
167 ; CI-NEXT: v_mov_b32_e32 v2, s0
168 ; CI-NEXT: s_lshr_b32 s2, s2, 16
169 ; CI-NEXT: flat_store_dword v[0:1], v2
170 ; CI-NEXT: ;;#ASMSTART
175 ; GFX11-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
177 ; GFX11-NEXT: s_clause 0x1
178 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
179 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
180 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
181 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
182 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX11-NEXT: s_lshr_b32 s2, s2, 16
184 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
185 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s4, s2
186 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
187 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
188 ; GFX11-NEXT: ;;#ASMSTART
189 ; GFX11-NEXT: ; use s2
190 ; GFX11-NEXT: ;;#ASMEND
191 ; GFX11-NEXT: s_endpgm
192 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
193 %elt1 = extractelement <2 x i16> %vec, i32 1
194 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
195 store <2 x i16> %vecins, ptr addrspace(1) %out
196 %use1 = zext i16 %elt1 to i32
197 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
; The inserted element is the high 16 bits of the i32 argument %elt.arg
; (lshr by 16, then trunc to i16); it replaces element 0 of the <2 x i16>
; loaded through %vec.ptr, and the result is stored to %out.
; The GFX9 and GFX11 checks expect a single s_pack_hh_b32_b16 taking both
; high halves; the VI and CI checks expect the vector shifted right by 16
; with s_lshr_b32 and then combined with the argument via
; v_alignbit_b32 ..., 16.
201 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
202 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
204 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
205 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30
206 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
207 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
208 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
209 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
210 ; GFX9-NEXT: s_pack_hh_b32_b16 s2, s4, s2
211 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
212 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
213 ; GFX9-NEXT: s_endpgm
215 ; VI-LABEL: s_insertelement_v2i16_0_reghi:
217 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
218 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30
219 ; VI-NEXT: s_waitcnt lgkmcnt(0)
220 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
221 ; VI-NEXT: v_mov_b32_e32 v0, s0
222 ; VI-NEXT: v_mov_b32_e32 v2, s4
223 ; VI-NEXT: v_mov_b32_e32 v1, s1
224 ; VI-NEXT: s_waitcnt lgkmcnt(0)
225 ; VI-NEXT: s_lshr_b32 s0, s2, 16
226 ; VI-NEXT: v_alignbit_b32 v2, s0, v2, 16
227 ; VI-NEXT: flat_store_dword v[0:1], v2
230 ; CI-LABEL: s_insertelement_v2i16_0_reghi:
232 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
233 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc
234 ; CI-NEXT: s_waitcnt lgkmcnt(0)
235 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
236 ; CI-NEXT: v_mov_b32_e32 v0, s0
237 ; CI-NEXT: v_mov_b32_e32 v2, s4
238 ; CI-NEXT: v_mov_b32_e32 v1, s1
239 ; CI-NEXT: s_waitcnt lgkmcnt(0)
240 ; CI-NEXT: s_lshr_b32 s0, s2, 16
241 ; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16
242 ; CI-NEXT: flat_store_dword v[0:1], v2
245 ; GFX11-LABEL: s_insertelement_v2i16_0_reghi:
247 ; GFX11-NEXT: s_clause 0x1
248 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
249 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
250 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
251 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
252 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX11-NEXT: s_pack_hh_b32_b16 s2, s4, s2
254 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
255 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
256 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
257 ; GFX11-NEXT: s_endpgm
258 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
259 %elt.hi = lshr i32 %elt.arg, 16
260 %elt = trunc i32 %elt.hi to i16
261 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
262 store <2 x i16> %vecins, ptr addrspace(1) %out
; Variant of the high-half insertion where the truncated element %elt has a
; second use: besides being inserted at element 0 it is zero-extended to i32
; and passed to an inline asm "; use $0" with an "s" constraint.
; Unlike the single-use variant this kernel has no [8 x i32] argument before
; %elt.arg (the checks load it from offset 0x10, or 0x4 on CI).
; The GFX9 and GFX11 checks expect an explicit s_lshr_b32 ..., 16 of the
; argument, an s_pack_lh_b32_b16 using that shifted value, and the same
; shifted register (s3) in the inline asm.
266 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 {
267 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
269 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
270 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
271 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
272 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
273 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
274 ; GFX9-NEXT: s_lshr_b32 s3, s4, 16
275 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
276 ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2
277 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
278 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
279 ; GFX9-NEXT: ;;#ASMSTART
280 ; GFX9-NEXT: ; use s3
281 ; GFX9-NEXT: ;;#ASMEND
282 ; GFX9-NEXT: s_endpgm
284 ; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
286 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
287 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
288 ; VI-NEXT: s_waitcnt lgkmcnt(0)
289 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
290 ; VI-NEXT: v_mov_b32_e32 v1, s1
291 ; VI-NEXT: v_mov_b32_e32 v2, s4
292 ; VI-NEXT: v_mov_b32_e32 v0, s0
293 ; VI-NEXT: s_lshr_b32 s0, s4, 16
294 ; VI-NEXT: s_waitcnt lgkmcnt(0)
295 ; VI-NEXT: s_lshr_b32 s1, s2, 16
296 ; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
297 ; VI-NEXT: flat_store_dword v[0:1], v2
298 ; VI-NEXT: ;;#ASMSTART
303 ; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
305 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
306 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
307 ; CI-NEXT: s_waitcnt lgkmcnt(0)
308 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
309 ; CI-NEXT: v_mov_b32_e32 v1, s1
310 ; CI-NEXT: v_mov_b32_e32 v2, s4
311 ; CI-NEXT: v_mov_b32_e32 v0, s0
312 ; CI-NEXT: s_lshr_b32 s0, s4, 16
313 ; CI-NEXT: s_waitcnt lgkmcnt(0)
314 ; CI-NEXT: s_lshr_b32 s1, s2, 16
315 ; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16
316 ; CI-NEXT: flat_store_dword v[0:1], v2
317 ; CI-NEXT: ;;#ASMSTART
322 ; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
324 ; GFX11-NEXT: s_clause 0x1
325 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
326 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
327 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
328 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
329 ; GFX11-NEXT: s_lshr_b32 s3, s4, 16
330 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
331 ; GFX11-NEXT: s_pack_lh_b32_b16 s2, s3, s2
332 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
333 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
334 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
335 ; GFX11-NEXT: ;;#ASMSTART
336 ; GFX11-NEXT: ; use s3
337 ; GFX11-NEXT: ;;#ASMEND
338 ; GFX11-NEXT: s_endpgm
339 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
340 %elt.hi = lshr i32 %elt.arg, 16
341 %elt = trunc i32 %elt.hi to i16
342 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
343 store <2 x i16> %vecins, ptr addrspace(1) %out
344 %use1 = zext i16 %elt to i32
345 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
; Variant where both halves have an extra use: the high 16 bits of %elt.arg
; are inserted at element 0, and both that truncated element and element 1 of
; the loaded vector are zero-extended to i32 and passed to two separate
; inline asm "; use $0" statements with "s" constraints.
; The GFX9 and GFX11 checks expect two s_lshr_b32 ..., 16 instructions (one
; for the argument, one for the vector) feeding an s_pack_ll_b32_b16, with
; the two shifted registers (s3, then s2) consumed by the inline asm blocks.
349 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 {
350 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
352 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
353 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
354 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
355 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
356 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
357 ; GFX9-NEXT: s_lshr_b32 s3, s4, 16
358 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
359 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
360 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2
361 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
362 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
363 ; GFX9-NEXT: ;;#ASMSTART
364 ; GFX9-NEXT: ; use s3
365 ; GFX9-NEXT: ;;#ASMEND
366 ; GFX9-NEXT: ;;#ASMSTART
367 ; GFX9-NEXT: ; use s2
368 ; GFX9-NEXT: ;;#ASMEND
369 ; GFX9-NEXT: s_endpgm
371 ; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
373 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
374 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
375 ; VI-NEXT: s_waitcnt lgkmcnt(0)
376 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
377 ; VI-NEXT: v_mov_b32_e32 v1, s1
378 ; VI-NEXT: v_mov_b32_e32 v2, s4
379 ; VI-NEXT: v_mov_b32_e32 v0, s0
380 ; VI-NEXT: s_lshr_b32 s0, s4, 16
381 ; VI-NEXT: s_waitcnt lgkmcnt(0)
382 ; VI-NEXT: s_lshr_b32 s1, s2, 16
383 ; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
384 ; VI-NEXT: flat_store_dword v[0:1], v2
385 ; VI-NEXT: ;;#ASMSTART
388 ; VI-NEXT: ;;#ASMSTART
393 ; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
395 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
396 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
397 ; CI-NEXT: s_waitcnt lgkmcnt(0)
398 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
399 ; CI-NEXT: v_mov_b32_e32 v1, s1
400 ; CI-NEXT: v_mov_b32_e32 v2, s4
401 ; CI-NEXT: v_mov_b32_e32 v0, s0
402 ; CI-NEXT: s_lshr_b32 s0, s4, 16
403 ; CI-NEXT: s_waitcnt lgkmcnt(0)
404 ; CI-NEXT: s_lshr_b32 s1, s2, 16
405 ; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16
406 ; CI-NEXT: flat_store_dword v[0:1], v2
407 ; CI-NEXT: ;;#ASMSTART
410 ; CI-NEXT: ;;#ASMSTART
415 ; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
417 ; GFX11-NEXT: s_clause 0x1
418 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
419 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
420 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
421 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
422 ; GFX11-NEXT: s_lshr_b32 s3, s4, 16
423 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
424 ; GFX11-NEXT: s_lshr_b32 s2, s2, 16
425 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
426 ; GFX11-NEXT: s_pack_ll_b32_b16 s4, s3, s2
427 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
428 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
429 ; GFX11-NEXT: ;;#ASMSTART
430 ; GFX11-NEXT: ; use s3
431 ; GFX11-NEXT: ;;#ASMEND
432 ; GFX11-NEXT: ;;#ASMSTART
433 ; GFX11-NEXT: ; use s2
434 ; GFX11-NEXT: ;;#ASMEND
435 ; GFX11-NEXT: s_endpgm
436 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
437 %elt.hi = lshr i32 %elt.arg, 16
438 %elt = trunc i32 %elt.hi to i16
439 %vec.hi = extractelement <2 x i16> %vec, i32 1
440 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
441 store <2 x i16> %vecins, ptr addrspace(1) %out
442 %use1 = zext i16 %elt to i32
443 %vec.hi.use1 = zext i16 %vec.hi to i32
445 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
446 call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
; Scalar case for the high element: loads a <2 x i16> through %vec.ptr,
; replaces element 1 with the constant 999 (0x3e7) and stores the result
; to %out.
; The GFX9 and GFX11 checks expect s_pack_ll_b32_b16 with the loaded vector
; as the first source and 0x3e7 as the second; the shared CI/VI checks expect
; s_and_b32 with 0xffff followed by s_or_b32 with 0x3e70000.
450 define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
451 ; GFX9-LABEL: s_insertelement_v2i16_1:
453 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
454 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
455 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
456 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
457 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
458 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x3e7
459 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
460 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
461 ; GFX9-NEXT: s_endpgm
463 ; CIVI-LABEL: s_insertelement_v2i16_1:
465 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
466 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
467 ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
468 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
469 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
470 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
471 ; CIVI-NEXT: s_and_b32 s0, s2, 0xffff
472 ; CIVI-NEXT: s_or_b32 s0, s0, 0x3e70000
473 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
474 ; CIVI-NEXT: flat_store_dword v[0:1], v2
475 ; CIVI-NEXT: s_endpgm
477 ; GFX11-LABEL: s_insertelement_v2i16_1:
479 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
480 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
481 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
482 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
483 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, 0x3e7
484 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
485 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
486 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
487 ; GFX11-NEXT: s_endpgm
488 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
489 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
490 store <2 x i16> %vecins, ptr addrspace(1) %out
; Scalar case inserting a variable into the high element: the i16 kernel
; argument %elt (declared after an unnamed [8 x i32] argument) replaces
; element 1 of the <2 x i16> loaded through %vec.ptr; the result is stored
; to %out.
; The GFX9 and GFX11 checks expect s_pack_ll_b32_b16 with the vector first
; and %elt second; the VI and CI checks expect %elt shifted left by 16 with
; s_lshl_b32, the vector masked with 0xffff, and an s_or_b32.
494 define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
495 ; GFX9-LABEL: s_insertelement_v2i16_1_reg:
497 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
498 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30
499 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
501 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
502 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
503 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
504 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
505 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
506 ; GFX9-NEXT: s_endpgm
508 ; VI-LABEL: s_insertelement_v2i16_1_reg:
510 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
511 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30
512 ; VI-NEXT: s_waitcnt lgkmcnt(0)
513 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
514 ; VI-NEXT: v_mov_b32_e32 v0, s0
515 ; VI-NEXT: v_mov_b32_e32 v1, s1
516 ; VI-NEXT: s_lshl_b32 s0, s4, 16
517 ; VI-NEXT: s_waitcnt lgkmcnt(0)
518 ; VI-NEXT: s_and_b32 s1, s2, 0xffff
519 ; VI-NEXT: s_or_b32 s0, s1, s0
520 ; VI-NEXT: v_mov_b32_e32 v2, s0
521 ; VI-NEXT: flat_store_dword v[0:1], v2
524 ; CI-LABEL: s_insertelement_v2i16_1_reg:
526 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
527 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc
528 ; CI-NEXT: s_waitcnt lgkmcnt(0)
529 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
530 ; CI-NEXT: v_mov_b32_e32 v0, s0
531 ; CI-NEXT: v_mov_b32_e32 v1, s1
532 ; CI-NEXT: s_lshl_b32 s1, s4, 16
533 ; CI-NEXT: s_waitcnt lgkmcnt(0)
534 ; CI-NEXT: s_and_b32 s0, s2, 0xffff
535 ; CI-NEXT: s_or_b32 s0, s0, s1
536 ; CI-NEXT: v_mov_b32_e32 v2, s0
537 ; CI-NEXT: flat_store_dword v[0:1], v2
540 ; GFX11-LABEL: s_insertelement_v2i16_1_reg:
542 ; GFX11-NEXT: s_clause 0x1
543 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
544 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
545 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
546 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
547 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
548 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
549 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
550 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
551 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
552 ; GFX11-NEXT: s_endpgm
553 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
554 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
555 store <2 x i16> %vecins, ptr addrspace(1) %out
; Half-precision scalar case: loads a <2 x half> through %vec.ptr, replaces
; element 0 with the constant 5.0 (which appears as 0x4500 in the checks) and
; stores the result to %out.
; The GFX9 and GFX11 checks expect the vector shifted right by 16 with
; s_lshr_b32 and then s_pack_ll_b32_b16 with 0x4500 as the first source; the
; shared CI/VI checks expect s_and_b32 with 0xffff0000 followed by s_or_b32
; with 0x4500.
559 define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
560 ; GFX9-LABEL: s_insertelement_v2f16_0:
562 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
563 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
564 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
565 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
566 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
567 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
568 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2
569 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
570 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
571 ; GFX9-NEXT: s_endpgm
573 ; CIVI-LABEL: s_insertelement_v2f16_0:
575 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
576 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
577 ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
578 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
579 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
580 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
581 ; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000
582 ; CIVI-NEXT: s_or_b32 s0, s0, 0x4500
583 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
584 ; CIVI-NEXT: flat_store_dword v[0:1], v2
585 ; CIVI-NEXT: s_endpgm
587 ; GFX11-LABEL: s_insertelement_v2f16_0:
589 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
590 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
591 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
592 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
593 ; GFX11-NEXT: s_lshr_b32 s2, s2, 16
594 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
595 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2
596 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
597 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
598 ; GFX11-NEXT: s_endpgm
599 %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
600 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
601 store <2 x half> %vecins, ptr addrspace(1) %out
; Half-precision scalar case for the high element: loads a <2 x half> through
; %vec.ptr, replaces element 1 with the constant 5.0 (0x4500 in the checks)
; and stores the result to %out.
; The GFX9 and GFX11 checks expect s_pack_ll_b32_b16 with the vector first and
; 0x4500 second; the shared CI/VI checks expect s_and_b32 with 0xffff followed
; by s_or_b32 with 0x45000000.
605 define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
606 ; GFX9-LABEL: s_insertelement_v2f16_1:
608 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
609 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
610 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
611 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
612 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
613 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x4500
614 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
615 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
616 ; GFX9-NEXT: s_endpgm
618 ; CIVI-LABEL: s_insertelement_v2f16_1:
620 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
621 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
622 ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
623 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
624 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
625 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
626 ; CIVI-NEXT: s_and_b32 s0, s2, 0xffff
627 ; CIVI-NEXT: s_or_b32 s0, s0, 0x45000000
628 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
629 ; CIVI-NEXT: flat_store_dword v[0:1], v2
630 ; CIVI-NEXT: s_endpgm
632 ; GFX11-LABEL: s_insertelement_v2f16_1:
634 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
635 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
636 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
637 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
638 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, 0x4500
639 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
640 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
641 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
642 ; GFX11-NEXT: s_endpgm
643 %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
644 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
645 store <2 x half> %vecins, ptr addrspace(1) %out
; Per-workitem (vector register) case: the workitem id x is sign-extended and
; used to index both %in and %out as arrays of <2 x i16> in addrspace(1); the
; loaded vector gets the constant 999 (0x3e7) inserted at element 0 and is
; stored to the matching %out slot.
; The GFX9 and GFX11 checks expect a v_bfi_b32 with a 0xffff mask selecting
; 0x3e7 for the low half; the VI and CI checks expect v_and_b32 with
; 0xffff0000 followed by v_or_b32 with 0x3e7.
649 define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
650 ; GFX9-LABEL: v_insertelement_v2i16_0:
652 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
653 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
654 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e7
655 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
656 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
657 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
658 ; GFX9-NEXT: s_waitcnt vmcnt(0)
659 ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1
660 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
661 ; GFX9-NEXT: s_endpgm
663 ; VI-LABEL: v_insertelement_v2i16_0:
665 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
666 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
667 ; VI-NEXT: s_waitcnt lgkmcnt(0)
668 ; VI-NEXT: v_mov_b32_e32 v1, s3
669 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
670 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
671 ; VI-NEXT: flat_load_dword v3, v[0:1]
672 ; VI-NEXT: v_mov_b32_e32 v1, s1
673 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
674 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
675 ; VI-NEXT: s_waitcnt vmcnt(0)
676 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
677 ; VI-NEXT: v_or_b32_e32 v2, 0x3e7, v2
678 ; VI-NEXT: flat_store_dword v[0:1], v2
681 ; CI-LABEL: v_insertelement_v2i16_0:
683 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
684 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
685 ; CI-NEXT: s_waitcnt lgkmcnt(0)
686 ; CI-NEXT: v_mov_b32_e32 v1, s3
687 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
688 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
689 ; CI-NEXT: flat_load_dword v3, v[0:1]
690 ; CI-NEXT: v_mov_b32_e32 v1, s1
691 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
692 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
693 ; CI-NEXT: s_waitcnt vmcnt(0)
694 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
695 ; CI-NEXT: v_or_b32_e32 v2, 0x3e7, v2
696 ; CI-NEXT: flat_store_dword v[0:1], v2
699 ; GFX11-LABEL: v_insertelement_v2i16_0:
701 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
702 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
703 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
704 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
705 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
706 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
707 ; GFX11-NEXT: s_movk_i32 s2, 0x3e7
708 ; GFX11-NEXT: s_waitcnt vmcnt(0)
709 ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1
710 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
711 ; GFX11-NEXT: s_endpgm
712 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
713 %tid.ext = sext i32 %tid to i64
714 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
715 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
716 %vec = load <2 x i16>, ptr addrspace(1) %in.gep
717 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
718 store <2 x i16> %vecins, ptr addrspace(1) %out.gep
; Per-workitem case where the inserted element is the high 16 bits of the i32
; kernel argument %elt.arg (lshr by 16, then trunc to i16); it replaces
; element 0 of the <2 x i16> loaded from %in at the workitem's index, and the
; result is stored to the matching %out slot.
; The GFX9 and GFX11 checks expect a v_perm_b32 with selector 0x7060302; the
; VI checks expect v_perm_b32 with the source operands in the other order and
; selector 0x3020706; the CI checks expect v_lshrrev_b32 by 16 followed by
; v_alignbit_b32 ..., 16.
722 define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %elt.arg) #0 {
723 ; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
725 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
726 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
727 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
728 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7060302
729 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
730 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
731 ; GFX9-NEXT: s_waitcnt vmcnt(0)
732 ; GFX9-NEXT: v_perm_b32 v1, v1, s4, v2
733 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
734 ; GFX9-NEXT: s_endpgm
736 ; VI-LABEL: v_insertelement_v2i16_0_reghi:
738 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
739 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
740 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
741 ; VI-NEXT: s_waitcnt lgkmcnt(0)
742 ; VI-NEXT: v_mov_b32_e32 v1, s3
743 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
744 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
745 ; VI-NEXT: flat_load_dword v3, v[0:1]
746 ; VI-NEXT: v_mov_b32_e32 v1, s1
747 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
748 ; VI-NEXT: v_mov_b32_e32 v2, 0x3020706
749 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
750 ; VI-NEXT: s_waitcnt vmcnt(0)
751 ; VI-NEXT: v_perm_b32 v2, s4, v3, v2
752 ; VI-NEXT: flat_store_dword v[0:1], v2
755 ; CI-LABEL: v_insertelement_v2i16_0_reghi:
757 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
758 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
759 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
760 ; CI-NEXT: s_waitcnt lgkmcnt(0)
761 ; CI-NEXT: v_mov_b32_e32 v1, s3
762 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
763 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
764 ; CI-NEXT: flat_load_dword v3, v[0:1]
765 ; CI-NEXT: v_mov_b32_e32 v1, s1
766 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
767 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
768 ; CI-NEXT: s_waitcnt vmcnt(0)
769 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
770 ; CI-NEXT: v_alignbit_b32 v2, v2, s4, 16
771 ; CI-NEXT: flat_store_dword v[0:1], v2
774 ; GFX11-LABEL: v_insertelement_v2i16_0_reghi:
776 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
777 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
778 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
779 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
780 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
781 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
782 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
783 ; GFX11-NEXT: s_waitcnt vmcnt(0)
784 ; GFX11-NEXT: v_perm_b32 v1, v1, s4, 0x7060302
785 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
786 ; GFX11-NEXT: s_endpgm
787 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
788 %tid.ext = sext i32 %tid to i64
789 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
790 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
791 %vec = load <2 x i16>, ptr addrspace(1) %in.gep
792 %elt.hi = lshr i32 %elt.arg, 16
793 %elt = trunc i32 %elt.hi to i16
794 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
795 store <2 x i16> %vecins, ptr addrspace(1) %out.gep
; Per-workitem case inserting the small constant 53 at element 0 of the
; <2 x i16> loaded from %in at the workitem's index; the result is stored to
; the matching %out slot.
; In every check block the value 53 appears directly as an instruction
; operand (v_bfi_b32 on GFX9 and GFX11, v_or_b32 on VI and CI) instead of
; being moved into a register first, unlike the 0x3e7 constant in
; v_insertelement_v2i16_0 on GFX9 and GFX11. Presumably this is the "inline
; immediate" the function name refers to; confirm against the ISA docs.
799 define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
800 ; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
802 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
803 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
804 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
805 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
806 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
807 ; GFX9-NEXT: s_waitcnt vmcnt(0)
808 ; GFX9-NEXT: v_bfi_b32 v1, s2, 53, v1
809 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
810 ; GFX9-NEXT: s_endpgm
812 ; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
814 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
815 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
816 ; VI-NEXT: s_waitcnt lgkmcnt(0)
817 ; VI-NEXT: v_mov_b32_e32 v1, s3
818 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
819 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
820 ; VI-NEXT: flat_load_dword v3, v[0:1]
821 ; VI-NEXT: v_mov_b32_e32 v1, s1
822 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
823 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
824 ; VI-NEXT: s_waitcnt vmcnt(0)
825 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
826 ; VI-NEXT: v_or_b32_e32 v2, 53, v2
827 ; VI-NEXT: flat_store_dword v[0:1], v2
830 ; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
832 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
833 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
834 ; CI-NEXT: s_waitcnt lgkmcnt(0)
835 ; CI-NEXT: v_mov_b32_e32 v1, s3
836 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
837 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
838 ; CI-NEXT: flat_load_dword v3, v[0:1]
839 ; CI-NEXT: v_mov_b32_e32 v1, s1
840 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
841 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
842 ; CI-NEXT: s_waitcnt vmcnt(0)
843 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
844 ; CI-NEXT: v_or_b32_e32 v2, 53, v2
845 ; CI-NEXT: flat_store_dword v[0:1], v2
848 ; GFX11-LABEL: v_insertelement_v2i16_0_inlineimm:
850 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
851 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
852 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
853 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
854 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
855 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
856 ; GFX11-NEXT: s_waitcnt vmcnt(0)
857 ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1
858 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
859 ; GFX11-NEXT: s_endpgm
860 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
861 %tid.ext = sext i32 %tid to i64
862 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
863 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
864 %vec = load <2 x i16>, ptr addrspace(1) %in.gep
865 %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
866 store <2 x i16> %vecins, ptr addrspace(1) %out.gep
870 ; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
; Per-workitem test: loads a <2 x i16> from %in at the workitem-id index,
; overwrites element 1 with the constant 999 (0x3e7) and stores the result to
; the same index of %out.
; Lowering pinned by the checks below: v_perm_b32 with selector 0x5040100 on
; GFX9/GFX11, v_or_b32_sdwa with 0x3e70000 on VI, and v_and_b32 0xffff followed
; by v_or_b32 0x3e70000 on CI.
871 define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
872 ; GFX9-LABEL: v_insertelement_v2i16_1:
874 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
875 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
876 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100
877 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
878 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
879 ; GFX9-NEXT: s_movk_i32 s2, 0x3e7
880 ; GFX9-NEXT: s_waitcnt vmcnt(0)
881 ; GFX9-NEXT: v_perm_b32 v1, s2, v1, v2
882 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
883 ; GFX9-NEXT: s_endpgm
885 ; VI-LABEL: v_insertelement_v2i16_1:
887 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
888 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
889 ; VI-NEXT: s_waitcnt lgkmcnt(0)
890 ; VI-NEXT: v_mov_b32_e32 v1, s3
891 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
892 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
893 ; VI-NEXT: flat_load_dword v3, v[0:1]
894 ; VI-NEXT: v_mov_b32_e32 v1, s1
895 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
896 ; VI-NEXT: v_mov_b32_e32 v2, 0x3e70000
897 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
898 ; VI-NEXT: s_waitcnt vmcnt(0)
899 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
900 ; VI-NEXT: flat_store_dword v[0:1], v2
903 ; CI-LABEL: v_insertelement_v2i16_1:
905 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
906 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
907 ; CI-NEXT: s_waitcnt lgkmcnt(0)
908 ; CI-NEXT: v_mov_b32_e32 v1, s3
909 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
910 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
911 ; CI-NEXT: flat_load_dword v3, v[0:1]
912 ; CI-NEXT: v_mov_b32_e32 v1, s1
913 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
914 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
915 ; CI-NEXT: s_waitcnt vmcnt(0)
916 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
917 ; CI-NEXT: v_or_b32_e32 v2, 0x3e70000, v2
918 ; CI-NEXT: flat_store_dword v[0:1], v2
921 ; GFX11-LABEL: v_insertelement_v2i16_1:
923 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
924 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
925 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
926 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
927 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
928 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
929 ; GFX11-NEXT: s_movk_i32 s2, 0x3e7
930 ; GFX11-NEXT: s_waitcnt vmcnt(0)
931 ; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100
932 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
933 ; GFX11-NEXT: s_endpgm
934 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
935 %tid.ext = sext i32 %tid to i64
936 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
937 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
938 %vec = load <2 x i16>, ptr addrspace(1) %in.gep
939 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
940 store <2 x i16> %vecins, ptr addrspace(1) %out.gep
; Same shape as the constant-999 element-1 test, but the inserted value is
; i16 -15: loads a <2 x i16> from %in at the workitem-id index, replaces
; element 1 with -15 and stores to the same index of %out.
; The GFX9/GFX11 checks expect -15 to appear directly as an operand of
; v_perm_b32 (no separate s_movk_i32); VI and CI expect the pre-shifted
; constant 0xfff10000 to be OR'ed with the low 16 bits of the loaded dword.
944 define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
945 ; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm:
947 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
948 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
949 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100
950 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
951 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
952 ; GFX9-NEXT: s_waitcnt vmcnt(0)
953 ; GFX9-NEXT: v_perm_b32 v1, -15, v1, v2
954 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
955 ; GFX9-NEXT: s_endpgm
957 ; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
959 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
960 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
961 ; VI-NEXT: s_waitcnt lgkmcnt(0)
962 ; VI-NEXT: v_mov_b32_e32 v1, s3
963 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
964 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
965 ; VI-NEXT: flat_load_dword v3, v[0:1]
966 ; VI-NEXT: v_mov_b32_e32 v1, s1
967 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
968 ; VI-NEXT: v_mov_b32_e32 v2, 0xfff10000
969 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
970 ; VI-NEXT: s_waitcnt vmcnt(0)
971 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
972 ; VI-NEXT: flat_store_dword v[0:1], v2
975 ; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
977 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
978 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
979 ; CI-NEXT: s_waitcnt lgkmcnt(0)
980 ; CI-NEXT: v_mov_b32_e32 v1, s3
981 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
982 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
983 ; CI-NEXT: flat_load_dword v3, v[0:1]
984 ; CI-NEXT: v_mov_b32_e32 v1, s1
985 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
986 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
987 ; CI-NEXT: s_waitcnt vmcnt(0)
988 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
989 ; CI-NEXT: v_or_b32_e32 v2, 0xfff10000, v2
990 ; CI-NEXT: flat_store_dword v[0:1], v2
993 ; GFX11-LABEL: v_insertelement_v2i16_1_inlineimm:
995 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
996 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
997 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
998 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
999 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1000 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1001 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1002 ; GFX11-NEXT: v_perm_b32 v1, -15, v1, 0x5040100
1003 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1004 ; GFX11-NEXT: s_endpgm
1005 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1006 %tid.ext = sext i32 %tid to i64
1007 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1008 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1009 %vec = load <2 x i16>, ptr addrspace(1) %in.gep
1010 %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
1011 store <2 x i16> %vecins, ptr addrspace(1) %out.gep
; Per-workitem test: loads a <2 x half> from %in at the workitem-id index,
; replaces element 0 with half 5.0 and stores to the same index of %out.
; In the checks the constant is materialized as 0x4500. GFX9/GFX11 expect a
; v_bfi_b32 with a 0xffff mask; VI/CI expect the loaded dword masked with
; 0xffff0000 and then OR'ed with 0x4500.
1015 define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1016 ; GFX9-LABEL: v_insertelement_v2f16_0:
1018 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1019 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1020 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500
1021 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1022 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1023 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1024 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1025 ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1
1026 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1027 ; GFX9-NEXT: s_endpgm
1029 ; VI-LABEL: v_insertelement_v2f16_0:
1031 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1032 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1033 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1034 ; VI-NEXT: v_mov_b32_e32 v1, s3
1035 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1036 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1037 ; VI-NEXT: flat_load_dword v3, v[0:1]
1038 ; VI-NEXT: v_mov_b32_e32 v1, s1
1039 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1040 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1041 ; VI-NEXT: s_waitcnt vmcnt(0)
1042 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1043 ; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2
1044 ; VI-NEXT: flat_store_dword v[0:1], v2
1047 ; CI-LABEL: v_insertelement_v2f16_0:
1049 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1050 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1051 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1052 ; CI-NEXT: v_mov_b32_e32 v1, s3
1053 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1054 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1055 ; CI-NEXT: flat_load_dword v3, v[0:1]
1056 ; CI-NEXT: v_mov_b32_e32 v1, s1
1057 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1058 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1059 ; CI-NEXT: s_waitcnt vmcnt(0)
1060 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1061 ; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2
1062 ; CI-NEXT: flat_store_dword v[0:1], v2
1065 ; GFX11-LABEL: v_insertelement_v2f16_0:
1067 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1068 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1069 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1070 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1071 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1072 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1073 ; GFX11-NEXT: s_movk_i32 s2, 0x4500
1074 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1075 ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1
1076 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1077 ; GFX11-NEXT: s_endpgm
1078 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1079 %tid.ext = sext i32 %tid to i64
1080 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1081 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1082 %vec = load <2 x half>, ptr addrspace(1) %in.gep
1083 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
1084 store <2 x half> %vecins, ptr addrspace(1) %out.gep
; Per-workitem test: loads a <2 x half> from %in at the workitem-id index,
; replaces element 0 with the half whose bit pattern is 0xH0035 and stores to
; the same index of %out.
; Every target's checks expect the value to be encoded as the literal 53
; directly in the merging instruction (v_bfi_b32 on GFX9/GFX11, v_or_b32 after
; a 0xffff0000 mask on VI/CI) rather than moved into a register first.
1088 define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1089 ; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm:
1091 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1092 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1093 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1094 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1095 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1096 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1097 ; GFX9-NEXT: v_bfi_b32 v1, s2, 53, v1
1098 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1099 ; GFX9-NEXT: s_endpgm
1101 ; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
1103 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1104 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1105 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1106 ; VI-NEXT: v_mov_b32_e32 v1, s3
1107 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1108 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1109 ; VI-NEXT: flat_load_dword v3, v[0:1]
1110 ; VI-NEXT: v_mov_b32_e32 v1, s1
1111 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1112 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1113 ; VI-NEXT: s_waitcnt vmcnt(0)
1114 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1115 ; VI-NEXT: v_or_b32_e32 v2, 53, v2
1116 ; VI-NEXT: flat_store_dword v[0:1], v2
1119 ; CI-LABEL: v_insertelement_v2f16_0_inlineimm:
1121 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1122 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1123 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1124 ; CI-NEXT: v_mov_b32_e32 v1, s3
1125 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1126 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1127 ; CI-NEXT: flat_load_dword v3, v[0:1]
1128 ; CI-NEXT: v_mov_b32_e32 v1, s1
1129 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1130 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1131 ; CI-NEXT: s_waitcnt vmcnt(0)
1132 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1133 ; CI-NEXT: v_or_b32_e32 v2, 53, v2
1134 ; CI-NEXT: flat_store_dword v[0:1], v2
1137 ; GFX11-LABEL: v_insertelement_v2f16_0_inlineimm:
1139 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1140 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1141 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1142 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1143 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1144 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1145 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1146 ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1
1147 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1148 ; GFX11-NEXT: s_endpgm
1149 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1150 %tid.ext = sext i32 %tid to i64
1151 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1152 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1153 %vec = load <2 x half>, ptr addrspace(1) %in.gep
1154 %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0
1155 store <2 x half> %vecins, ptr addrspace(1) %out.gep
; Per-workitem test: loads a <2 x half> from %in at the workitem-id index,
; replaces element 1 with half 5.0 and stores to the same index of %out.
; GFX9/GFX11 checks expect 0x4500 in an SGPR combined via v_perm_b32 with
; selector 0x5040100; VI/CI checks expect the pre-shifted constant 0x45000000
; OR'ed with the low 16 bits of the loaded dword.
1159 define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1160 ; GFX9-LABEL: v_insertelement_v2f16_1:
1162 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1163 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1164 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100
1165 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1166 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1167 ; GFX9-NEXT: s_movk_i32 s2, 0x4500
1168 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1169 ; GFX9-NEXT: v_perm_b32 v1, s2, v1, v2
1170 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1171 ; GFX9-NEXT: s_endpgm
1173 ; VI-LABEL: v_insertelement_v2f16_1:
1175 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1176 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1177 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1178 ; VI-NEXT: v_mov_b32_e32 v1, s3
1179 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1180 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1181 ; VI-NEXT: flat_load_dword v3, v[0:1]
1182 ; VI-NEXT: v_mov_b32_e32 v1, s1
1183 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1184 ; VI-NEXT: v_mov_b32_e32 v2, 0x45000000
1185 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1186 ; VI-NEXT: s_waitcnt vmcnt(0)
1187 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1188 ; VI-NEXT: flat_store_dword v[0:1], v2
1191 ; CI-LABEL: v_insertelement_v2f16_1:
1193 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1194 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1195 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1196 ; CI-NEXT: v_mov_b32_e32 v1, s3
1197 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1198 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1199 ; CI-NEXT: flat_load_dword v3, v[0:1]
1200 ; CI-NEXT: v_mov_b32_e32 v1, s1
1201 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1202 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1203 ; CI-NEXT: s_waitcnt vmcnt(0)
1204 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
1205 ; CI-NEXT: v_or_b32_e32 v2, 0x45000000, v2
1206 ; CI-NEXT: flat_store_dword v[0:1], v2
1209 ; GFX11-LABEL: v_insertelement_v2f16_1:
1211 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1212 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1213 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1214 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1215 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1216 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1217 ; GFX11-NEXT: s_movk_i32 s2, 0x4500
1218 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1219 ; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100
1220 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1221 ; GFX11-NEXT: s_endpgm
1222 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1223 %tid.ext = sext i32 %tid to i64
1224 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1225 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1226 %vec = load <2 x half>, ptr addrspace(1) %in.gep
1227 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
1228 store <2 x half> %vecins, ptr addrspace(1) %out.gep
; Per-workitem test: loads a <2 x half> from %in at the workitem-id index,
; replaces element 1 with the half whose bit pattern is 0xH0023 and stores to
; the same index of %out.
; GFX9/GFX11 checks expect the literal 35 as a direct v_perm_b32 operand
; (selector 0x5040100); VI/CI checks expect the pre-shifted constant 0x230000
; OR'ed with the low 16 bits of the loaded dword.
1232 define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1233 ; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm:
1235 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1236 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1237 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100
1238 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1239 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1240 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1241 ; GFX9-NEXT: v_perm_b32 v1, 35, v1, v2
1242 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1243 ; GFX9-NEXT: s_endpgm
1245 ; VI-LABEL: v_insertelement_v2f16_1_inlineimm:
1247 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1248 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1249 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1250 ; VI-NEXT: v_mov_b32_e32 v1, s3
1251 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1252 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1253 ; VI-NEXT: flat_load_dword v3, v[0:1]
1254 ; VI-NEXT: v_mov_b32_e32 v1, s1
1255 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1256 ; VI-NEXT: v_mov_b32_e32 v2, 0x230000
1257 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1258 ; VI-NEXT: s_waitcnt vmcnt(0)
1259 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1260 ; VI-NEXT: flat_store_dword v[0:1], v2
1263 ; CI-LABEL: v_insertelement_v2f16_1_inlineimm:
1265 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1266 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1267 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1268 ; CI-NEXT: v_mov_b32_e32 v1, s3
1269 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1270 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1271 ; CI-NEXT: flat_load_dword v3, v[0:1]
1272 ; CI-NEXT: v_mov_b32_e32 v1, s1
1273 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1274 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1275 ; CI-NEXT: s_waitcnt vmcnt(0)
1276 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
1277 ; CI-NEXT: v_or_b32_e32 v2, 0x230000, v2
1278 ; CI-NEXT: flat_store_dword v[0:1], v2
1281 ; GFX11-LABEL: v_insertelement_v2f16_1_inlineimm:
1283 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1284 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1285 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1286 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1287 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1288 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1289 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1290 ; GFX11-NEXT: v_perm_b32 v1, 35, v1, 0x5040100
1291 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1292 ; GFX11-NEXT: s_endpgm
1293 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1294 %tid.ext = sext i32 %tid to i64
1295 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1296 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1297 %vec = load <2 x half>, ptr addrspace(1) %in.gep
1298 %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1
1299 store <2 x half> %vecins, ptr addrspace(1) %out.gep
1303 ; FIXME: Enable for others when argument load not split
; Dynamic-index insert with no per-workitem addressing: the index is read with
; a volatile load from %idx.ptr and the <2 x i16> vector from %vec.ptr (both
; addrspace(4)); the constant 999 is inserted at that index and the result is
; stored to %out.
; All four check prefixes expect the same scalar sequence: index << 4, then
; 0xffff shifted left by that amount to form a lane mask, the vector AND-NOT
; the mask, the mask AND 0x3e703e7 (999 in both halves), and a final OR.
1304 define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(4) %idx.ptr) #0 {
1305 ; GFX9-LABEL: s_insertelement_v2i16_dynamic:
1307 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1308 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1309 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1310 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1311 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
1312 ; GFX9-NEXT: s_load_dword s7, s[2:3], 0x0
1313 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1314 ; GFX9-NEXT: s_lshl_b32 s2, s6, 4
1315 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2
1316 ; GFX9-NEXT: s_andn2_b32 s3, s7, s2
1317 ; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7
1318 ; GFX9-NEXT: s_or_b32 s2, s2, s3
1319 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1320 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1321 ; GFX9-NEXT: s_endpgm
1323 ; VI-LABEL: s_insertelement_v2i16_dynamic:
1325 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1326 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1327 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1328 ; VI-NEXT: s_load_dword s4, s[4:5], 0x0
1329 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
1330 ; VI-NEXT: v_mov_b32_e32 v0, s0
1331 ; VI-NEXT: v_mov_b32_e32 v1, s1
1332 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1333 ; VI-NEXT: s_lshl_b32 s0, s4, 4
1334 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
1335 ; VI-NEXT: s_andn2_b32 s1, s2, s0
1336 ; VI-NEXT: s_and_b32 s0, s0, 0x3e703e7
1337 ; VI-NEXT: s_or_b32 s0, s0, s1
1338 ; VI-NEXT: v_mov_b32_e32 v2, s0
1339 ; VI-NEXT: flat_store_dword v[0:1], v2
1342 ; CI-LABEL: s_insertelement_v2i16_dynamic:
1344 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
1345 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1346 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1347 ; CI-NEXT: s_load_dword s4, s[4:5], 0x0
1348 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
1349 ; CI-NEXT: v_mov_b32_e32 v0, s0
1350 ; CI-NEXT: v_mov_b32_e32 v1, s1
1351 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1352 ; CI-NEXT: s_lshl_b32 s0, s4, 4
1353 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0
1354 ; CI-NEXT: s_andn2_b32 s1, s2, s0
1355 ; CI-NEXT: s_and_b32 s0, s0, 0x3e703e7
1356 ; CI-NEXT: s_or_b32 s0, s0, s1
1357 ; CI-NEXT: v_mov_b32_e32 v2, s0
1358 ; CI-NEXT: flat_store_dword v[0:1], v2
1361 ; GFX11-LABEL: s_insertelement_v2i16_dynamic:
1363 ; GFX11-NEXT: s_clause 0x1
1364 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
1365 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1366 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1367 ; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0
1368 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
1369 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1370 ; GFX11-NEXT: s_lshl_b32 s3, s4, 4
1371 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1372 ; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s3
1373 ; GFX11-NEXT: s_and_not1_b32 s2, s2, s3
1374 ; GFX11-NEXT: s_and_b32 s3, s3, 0x3e703e7
1375 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1376 ; GFX11-NEXT: s_or_b32 s2, s3, s2
1377 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
1378 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1379 ; GFX11-NEXT: s_endpgm
1380 %idx = load volatile i32, ptr addrspace(4) %idx.ptr
1381 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
1382 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1383 store <2 x i16> %vecins, ptr addrspace(1) %out
; Dynamic-index insert where the index is the kernel argument %idx and the
; vector is per-workitem: loads a <2 x i16> from %in at the workitem-id index,
; inserts the constant 999 at element %idx and stores to the same index of
; %out.
; The checks expect the lane mask to be built with scalar shifts (%idx << 4,
; then 0xffff << that) and applied with a single v_bfi_b32 against 0x3e703e7
; (999 in both halves) on every target.
1387 define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 {
1388 ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1390 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1391 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
1392 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1393 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7
1394 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1395 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1396 ; GFX9-NEXT: s_lshl_b32 s2, s4, 4
1397 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2
1398 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1399 ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1
1400 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1401 ; GFX9-NEXT: s_endpgm
1403 ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1405 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1406 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
1407 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1408 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1409 ; VI-NEXT: v_mov_b32_e32 v1, s3
1410 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1411 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1412 ; VI-NEXT: flat_load_dword v3, v[0:1]
1413 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1414 ; VI-NEXT: s_lshl_b32 s0, s4, 4
1415 ; VI-NEXT: v_mov_b32_e32 v1, s1
1416 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
1417 ; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7
1418 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1419 ; VI-NEXT: s_waitcnt vmcnt(0)
1420 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
1421 ; VI-NEXT: flat_store_dword v[0:1], v2
1424 ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1426 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1427 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
1428 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1429 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1430 ; CI-NEXT: v_mov_b32_e32 v1, s3
1431 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1432 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1433 ; CI-NEXT: flat_load_dword v3, v[0:1]
1434 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1435 ; CI-NEXT: s_lshl_b32 s0, s4, 4
1436 ; CI-NEXT: v_mov_b32_e32 v1, s1
1437 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0
1438 ; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7
1439 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1440 ; CI-NEXT: s_waitcnt vmcnt(0)
1441 ; CI-NEXT: v_bfi_b32 v2, s0, v2, v3
1442 ; CI-NEXT: flat_store_dword v[0:1], v2
1445 ; GFX11-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1447 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1448 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1449 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
1450 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
1451 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1452 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1453 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1454 ; GFX11-NEXT: s_lshl_b32 s2, s4, 4
1455 ; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s2
1456 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1457 ; GFX11-NEXT: v_bfi_b32 v1, s2, 0x3e703e7, v1
1458 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1459 ; GFX11-NEXT: s_endpgm
1460 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1461 %tid.ext = sext i32 %tid to i64
1462 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1463 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1464 %vec = load <2 x i16>, ptr addrspace(1) %in.gep
1465 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1466 store <2 x i16> %vecins, ptr addrspace(1) %out.gep
; Dynamic-index insert where both the index and the vector are per-workitem:
; loads an i32 index from %idx.ptr and a <2 x half> from %in, each at the
; workitem-id index, inserts half 0xH1234 at the loaded index and stores to the
; same index of %out.
; The checks expect the lane mask to be built with vector shifts (index << 4,
; then 0xffff shifted by that; CI uses v_lshl_b32 where the others use
; v_lshlrev_b32_e64) and applied with v_bfi_b32 against 0x12341234.
1470 define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 {
1471 ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1473 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1474 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1475 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1476 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1477 ; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
1478 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
1479 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1480 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1481 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
1482 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2
1483 ; GFX9-NEXT: s_mov_b32 s2, 0x12341234
1484 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1485 ; GFX9-NEXT: v_bfi_b32 v1, v1, s2, v2
1486 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1487 ; GFX9-NEXT: s_endpgm
1489 ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1491 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1492 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1493 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1494 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1495 ; VI-NEXT: v_mov_b32_e32 v3, s3
1496 ; VI-NEXT: v_mov_b32_e32 v1, s5
1497 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1498 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1499 ; VI-NEXT: flat_load_dword v4, v[0:1]
1500 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1501 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1502 ; VI-NEXT: flat_load_dword v3, v[0:1]
1503 ; VI-NEXT: s_mov_b32 s2, 0xffff
1504 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1505 ; VI-NEXT: v_mov_b32_e32 v1, s1
1506 ; VI-NEXT: s_mov_b32 s0, 0x12341234
1507 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1508 ; VI-NEXT: s_waitcnt vmcnt(1)
1509 ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4
1510 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2
1511 ; VI-NEXT: s_waitcnt vmcnt(0)
1512 ; VI-NEXT: v_bfi_b32 v2, v2, s0, v3
1513 ; VI-NEXT: flat_store_dword v[0:1], v2
1516 ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1518 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1519 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
1520 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1521 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1522 ; CI-NEXT: v_mov_b32_e32 v3, s3
1523 ; CI-NEXT: v_mov_b32_e32 v1, s5
1524 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
1525 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1526 ; CI-NEXT: flat_load_dword v4, v[0:1]
1527 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1528 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1529 ; CI-NEXT: flat_load_dword v3, v[0:1]
1530 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1531 ; CI-NEXT: v_mov_b32_e32 v1, s1
1532 ; CI-NEXT: s_mov_b32 s0, 0x12341234
1533 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1534 ; CI-NEXT: s_waitcnt vmcnt(1)
1535 ; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4
1536 ; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
1537 ; CI-NEXT: s_waitcnt vmcnt(0)
1538 ; CI-NEXT: v_bfi_b32 v2, v2, s0, v3
1539 ; CI-NEXT: flat_store_dword v[0:1], v2
1542 ; GFX11-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1544 ; GFX11-NEXT: s_clause 0x1
1545 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x10
1546 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1547 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1548 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1549 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1550 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1551 ; GFX11-NEXT: s_clause 0x1
1552 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
1553 ; GFX11-NEXT: global_load_b32 v2, v0, s[2:3]
1554 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1555 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1
1556 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1557 ; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 0xffff
1558 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1559 ; GFX11-NEXT: v_bfi_b32 v1, v1, 0x12341234, v2
1560 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1561 ; GFX11-NEXT: s_endpgm
1562 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1563 %tid.ext = sext i32 %tid to i64
1564 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1565 %idx.gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
1566 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1567 %idx = load i32, ptr addrspace(1) %idx.gep
1568 %vec = load <2 x half>, ptr addrspace(1) %in.gep
1569 %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
1570 store <2 x half> %vecins, ptr addrspace(1) %out.gep
; Per-workitem test on a 64-bit vector: loads a <4 x half> from %in at the
; workitem-id index, truncates the kernel argument %val to i16, bitcasts it to
; half, inserts it as element 0 and stores to the same index of %out.
; An unnamed [8 x i32] argument sits between %in and %val; the checks load
; %val from offset 0x30 (0xc in the CI checks).
; Only the first register of the loaded pair (v0) is rewritten in the checks:
; v_bfi_b32 with a 0xffff mask on GFX9/CI/GFX11, v_perm_b32 with selector
; 0x3020504 on VI. The full v[0:1] pair is then stored.
1574 define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
1575 ; GFX9-LABEL: v_insertelement_v4f16_0:
1577 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1578 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30
1579 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1580 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1581 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1582 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1583 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
1584 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1585 ; GFX9-NEXT: v_bfi_b32 v0, s2, v3, v0
1586 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1587 ; GFX9-NEXT: s_endpgm
1589 ; VI-LABEL: v_insertelement_v4f16_0:
1591 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1592 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30
1593 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1594 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
1595 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1596 ; VI-NEXT: v_mov_b32_e32 v1, s3
1597 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1598 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1599 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1600 ; VI-NEXT: v_mov_b32_e32 v3, s1
1601 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1602 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1603 ; VI-NEXT: s_waitcnt vmcnt(0)
1604 ; VI-NEXT: v_perm_b32 v0, s4, v0, v4
1605 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1608 ; CI-LABEL: v_insertelement_v4f16_0:
1610 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1611 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc
1612 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1613 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1614 ; CI-NEXT: v_mov_b32_e32 v1, s3
1615 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1616 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1617 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1618 ; CI-NEXT: v_mov_b32_e32 v3, s1
1619 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1620 ; CI-NEXT: s_mov_b32 s0, 0xffff
1621 ; CI-NEXT: v_mov_b32_e32 v4, s4
1622 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1623 ; CI-NEXT: s_waitcnt vmcnt(0)
1624 ; CI-NEXT: v_bfi_b32 v0, s0, v4, v0
1625 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1628 ; GFX11-LABEL: v_insertelement_v4f16_0:
1630 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1631 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1632 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
1633 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1634 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1635 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1636 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
1637 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1638 ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s4, v0
1639 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1640 ; GFX11-NEXT: s_endpgm
1641 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1642 %tid.ext = sext i32 %tid to i64
1643 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
1644 %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
1645 %vec = load <4 x half>, ptr addrspace(1) %in.gep
1646 %val.trunc = trunc i32 %val to i16
1647 %val.cvt = bitcast i16 %val.trunc to half
1648 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
1649 store <4 x half> %vecins, ptr addrspace(1) %out.gep
; Inserts the low 16 bits of the scalar kernel argument %val (reinterpreted as
; half) into element 1 of a <4 x half> that each work-item loads from %in, and
; stores the updated vector to %out.
; Element 1 is the high half of the first loaded dword, so the expected code
; on every target rewrites only v0 - with v_perm_b32 on GFX9, VI and GFX11,
; and with an s_lshl_b32 / v_and_b32 / v_or_b32 sequence on CI.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
1653 define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
1654 ; GFX9-LABEL: v_insertelement_v4f16_1:
1656 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1657 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
1658 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1659 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100
1660 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1661 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1662 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1663 ; GFX9-NEXT: v_perm_b32 v0, s4, v0, v3
1664 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1665 ; GFX9-NEXT: s_endpgm
1667 ; VI-LABEL: v_insertelement_v4f16_1:
1669 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1670 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
1671 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1672 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
1673 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1674 ; VI-NEXT: v_mov_b32_e32 v1, s3
1675 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1676 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1677 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1678 ; VI-NEXT: v_mov_b32_e32 v3, s1
1679 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1680 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1681 ; VI-NEXT: s_waitcnt vmcnt(0)
1682 ; VI-NEXT: v_perm_b32 v0, v0, s4, v4
1683 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1686 ; CI-LABEL: v_insertelement_v4f16_1:
1688 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1689 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
1690 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1691 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1692 ; CI-NEXT: v_mov_b32_e32 v1, s3
1693 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1694 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1695 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1696 ; CI-NEXT: v_mov_b32_e32 v3, s1
1697 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1698 ; CI-NEXT: s_lshl_b32 s0, s4, 16
1699 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1700 ; CI-NEXT: s_waitcnt vmcnt(0)
1701 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1702 ; CI-NEXT: v_or_b32_e32 v0, s0, v0
1703 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1706 ; GFX11-LABEL: v_insertelement_v4f16_1:
1708 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1709 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1710 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
1711 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1712 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1713 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1714 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
1715 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1716 ; GFX11-NEXT: v_perm_b32 v0, s4, v0, 0x5040100
1717 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1718 ; GFX11-NEXT: s_endpgm
; The work-item id indexes both %in and %out, so every lane handles its own
; 8-byte vector.
1719 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1720 %tid.ext = sext i32 %tid to i64
1721 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
1722 %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
1723 %vec = load <4 x half>, ptr addrspace(1) %in.gep
; Only the low 16 bits of the i32 argument are inserted; the bitcast changes
; the type to half without changing the bits.
1724 %val.trunc = trunc i32 %val to i16
1725 %val.cvt = bitcast i16 %val.trunc to half
1726 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
1727 store <4 x half> %vecins, ptr addrspace(1) %out.gep
; Inserts the low 16 bits of the scalar kernel argument %val (reinterpreted as
; half) into element 2 of a <4 x half> that each work-item loads from %in, and
; stores the updated vector to %out.
; Element 2 is the low half of the second loaded dword, so the expected code
; rewrites only v1 - with v_bfi_b32 and a 0xffff mask on GFX9, CI and GFX11,
; and with v_perm_b32 on VI.
; The unnamed [8 x i32] argument sits between %in and %val; with it the checks
; load %val from kernarg offset 0x30 (0xc in the CI checks).
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
1731 define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
1732 ; GFX9-LABEL: v_insertelement_v4f16_2:
1734 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1735 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x30
1736 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1737 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1738 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1739 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1740 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
1741 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1742 ; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1
1743 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1744 ; GFX9-NEXT: s_endpgm
1746 ; VI-LABEL: v_insertelement_v4f16_2:
1748 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1749 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30
1750 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1751 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
1752 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1753 ; VI-NEXT: v_mov_b32_e32 v1, s3
1754 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1755 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1756 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1757 ; VI-NEXT: v_mov_b32_e32 v3, s1
1758 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1759 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1760 ; VI-NEXT: s_waitcnt vmcnt(0)
1761 ; VI-NEXT: v_perm_b32 v1, s4, v1, v4
1762 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1765 ; CI-LABEL: v_insertelement_v4f16_2:
1767 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1768 ; CI-NEXT: s_load_dword s4, s[8:9], 0xc
1769 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1770 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1771 ; CI-NEXT: v_mov_b32_e32 v1, s3
1772 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1773 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1774 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1775 ; CI-NEXT: v_mov_b32_e32 v3, s1
1776 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1777 ; CI-NEXT: s_mov_b32 s0, 0xffff
1778 ; CI-NEXT: v_mov_b32_e32 v4, s4
1779 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1780 ; CI-NEXT: s_waitcnt vmcnt(0)
1781 ; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
1782 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1785 ; GFX11-LABEL: v_insertelement_v4f16_2:
1787 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1788 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1789 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
1790 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1791 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1792 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1793 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
1794 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1795 ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
1796 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1797 ; GFX11-NEXT: s_endpgm
; The work-item id indexes both %in and %out, so every lane handles its own
; 8-byte vector.
1798 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1799 %tid.ext = sext i32 %tid to i64
1800 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
1801 %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
1802 %vec = load <4 x half>, ptr addrspace(1) %in.gep
; Only the low 16 bits of the i32 argument are inserted; the bitcast changes
; the type to half without changing the bits.
1803 %val.trunc = trunc i32 %val to i16
1804 %val.cvt = bitcast i16 %val.trunc to half
1805 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
1806 store <4 x half> %vecins, ptr addrspace(1) %out.gep
; Inserts the low 16 bits of the scalar kernel argument %val (reinterpreted as
; half) into element 3 of a <4 x half> that each work-item loads from %in, and
; stores the updated vector to %out.
; Element 3 is the high half of the second loaded dword, so the expected code
; rewrites only v1 - with v_perm_b32 on GFX9, VI and GFX11, and with an
; s_lshl_b32 / v_and_b32 / v_or_b32 sequence on CI.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
1810 define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
1811 ; GFX9-LABEL: v_insertelement_v4f16_3:
1813 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1814 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
1815 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1816 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100
1817 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1818 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1819 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1820 ; GFX9-NEXT: v_perm_b32 v1, s4, v1, v3
1821 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1822 ; GFX9-NEXT: s_endpgm
1824 ; VI-LABEL: v_insertelement_v4f16_3:
1826 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1827 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
1828 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1829 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
1830 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1831 ; VI-NEXT: v_mov_b32_e32 v1, s3
1832 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1833 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1834 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1835 ; VI-NEXT: v_mov_b32_e32 v3, s1
1836 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1837 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1838 ; VI-NEXT: s_waitcnt vmcnt(0)
1839 ; VI-NEXT: v_perm_b32 v1, v1, s4, v4
1840 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1843 ; CI-LABEL: v_insertelement_v4f16_3:
1845 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1846 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
1847 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1848 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1849 ; CI-NEXT: v_mov_b32_e32 v1, s3
1850 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1851 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1852 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1853 ; CI-NEXT: v_mov_b32_e32 v3, s1
1854 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1855 ; CI-NEXT: s_lshl_b32 s0, s4, 16
1856 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1857 ; CI-NEXT: s_waitcnt vmcnt(0)
1858 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
1859 ; CI-NEXT: v_or_b32_e32 v1, s0, v1
1860 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1863 ; GFX11-LABEL: v_insertelement_v4f16_3:
1865 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1866 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1867 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
1868 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1869 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1870 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1871 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
1872 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1873 ; GFX11-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
1874 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1875 ; GFX11-NEXT: s_endpgm
; The work-item id indexes both %in and %out, so every lane handles its own
; 8-byte vector.
1876 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1877 %tid.ext = sext i32 %tid to i64
1878 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
1879 %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
1880 %vec = load <4 x half>, ptr addrspace(1) %in.gep
; Only the low 16 bits of the i32 argument are inserted; the bitcast changes
; the type to half without changing the bits.
1881 %val.trunc = trunc i32 %val to i16
1882 %val.cvt = bitcast i16 %val.trunc to half
1883 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
1884 store <4 x half> %vecins, ptr addrspace(1) %out.gep
; Integer variant: inserts the low 16 bits of the scalar kernel argument %val
; into element 2 of a <4 x i16> that each work-item loads from %in, and stores
; the updated vector to %out.
; Element 2 is the low half of the second loaded dword, so the expected code
; rewrites only v1 - with v_bfi_b32 and a 0xffff mask on GFX9, CI and GFX11,
; and with v_perm_b32 on VI.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
1888 define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
1889 ; GFX9-LABEL: v_insertelement_v4i16_2:
1891 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1892 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
1893 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1894 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1895 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1896 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1897 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
1898 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1899 ; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1
1900 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1901 ; GFX9-NEXT: s_endpgm
1903 ; VI-LABEL: v_insertelement_v4i16_2:
1905 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1906 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
1907 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1908 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
1909 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1910 ; VI-NEXT: v_mov_b32_e32 v1, s3
1911 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1912 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1913 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1914 ; VI-NEXT: v_mov_b32_e32 v3, s1
1915 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1916 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1917 ; VI-NEXT: s_waitcnt vmcnt(0)
1918 ; VI-NEXT: v_perm_b32 v1, s4, v1, v4
1919 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1922 ; CI-LABEL: v_insertelement_v4i16_2:
1924 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1925 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
1926 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1927 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1928 ; CI-NEXT: v_mov_b32_e32 v1, s3
1929 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1930 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1931 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1932 ; CI-NEXT: v_mov_b32_e32 v3, s1
1933 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1934 ; CI-NEXT: s_mov_b32 s0, 0xffff
1935 ; CI-NEXT: v_mov_b32_e32 v4, s4
1936 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1937 ; CI-NEXT: s_waitcnt vmcnt(0)
1938 ; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
1939 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1942 ; GFX11-LABEL: v_insertelement_v4i16_2:
1944 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
1945 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1946 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
1947 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1948 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1949 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1950 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
1951 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1952 ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
1953 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1954 ; GFX11-NEXT: s_endpgm
; The work-item id indexes both %in and %out, so every lane handles its own
; 8-byte vector.
1955 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1956 %tid.ext = sext i32 %tid to i64
1957 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1958 %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1959 %vec = load <4 x i16>, ptr addrspace(1) %in.gep
; Only the low 16 bits of the i32 argument are inserted. The bitcast below has
; identical source and destination types, so it leaves the value unchanged
; (presumably kept so the body has the same shape as the half tests).
1960 %val.trunc = trunc i32 %val to i16
1961 %val.cvt = bitcast i16 %val.trunc to i16
1962 %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
1963 store <4 x i16> %vecins, ptr addrspace(1) %out.gep
; Inserts the low 16 bits of the scalar kernel argument %val into a <4 x i16>
; at an index that is only known at run time and is held in a VGPR. Each
; work-item loads its vector from %in and stores the result to %out.
; The index comes from a volatile load, which the expected code keeps as a
; global/flat load with glc. The expected code then shifts a 64-bit 0xffff
; mask left by index*16, splats %val into both halves of a dword, and merges
; it into both loaded dwords with v_bfi_b32.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
1967 ; FIXME: Better code on CI?
1968 define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
1969 ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1971 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1972 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
1973 ; GFX9-NEXT: global_load_dword v2, v[0:1], off glc
1974 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1975 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1976 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1977 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
1978 ; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff
1979 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
1980 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
1981 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4
1982 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1983 ; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1
1984 ; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0
1985 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
1986 ; GFX9-NEXT: s_endpgm
1988 ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1990 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1991 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
1992 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
1993 ; VI-NEXT: s_waitcnt vmcnt(0)
1994 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1995 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1996 ; VI-NEXT: v_mov_b32_e32 v1, s3
1997 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1998 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1999 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2000 ; VI-NEXT: s_mov_b64 s[2:3], 0xffff
2001 ; VI-NEXT: v_mov_b32_e32 v3, s1
2002 ; VI-NEXT: s_lshl_b32 s1, s4, 16
2003 ; VI-NEXT: s_and_b32 s4, s4, 0xffff
2004 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
2005 ; VI-NEXT: s_or_b32 s0, s4, s1
2006 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2007 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
2008 ; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3]
2009 ; VI-NEXT: s_waitcnt vmcnt(0)
2010 ; VI-NEXT: v_bfi_b32 v1, v5, s0, v1
2011 ; VI-NEXT: v_bfi_b32 v0, v4, s0, v0
2012 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2015 ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
2017 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2018 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
2019 ; CI-NEXT: flat_load_dword v4, v[0:1] glc
2020 ; CI-NEXT: s_waitcnt vmcnt(0)
2021 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
2022 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2023 ; CI-NEXT: v_mov_b32_e32 v1, s3
2024 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
2025 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2026 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2027 ; CI-NEXT: s_mov_b64 s[2:3], 0xffff
2028 ; CI-NEXT: v_mov_b32_e32 v3, s1
2029 ; CI-NEXT: s_lshl_b32 s1, s4, 16
2030 ; CI-NEXT: s_and_b32 s4, s4, 0xffff
2031 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
2032 ; CI-NEXT: s_or_b32 s0, s4, s1
2033 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2034 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
2035 ; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4
2036 ; CI-NEXT: s_waitcnt vmcnt(0)
2037 ; CI-NEXT: v_bfi_b32 v1, v5, s0, v1
2038 ; CI-NEXT: v_bfi_b32 v0, v4, s0, v0
2039 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2042 ; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr:
2044 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
2045 ; GFX11-NEXT: global_load_b32 v2, v[0:1], off glc dlc
2046 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2047 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2048 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
2049 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2050 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
2051 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2052 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3]
2053 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s4, s4
2054 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2
2055 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 0xffff
2056 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2057 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
2058 ; GFX11-NEXT: v_bfi_b32 v1, v3, s2, v1
2059 ; GFX11-NEXT: v_bfi_b32 v0, v2, s2, v0
2060 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
2061 ; GFX11-NEXT: s_endpgm
; The work-item id indexes both %in and %out, so every lane handles its own
; 8-byte vector.
2062 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2063 %tid.ext = sext i32 %tid to i64
2064 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2065 %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
; The address is a placeholder (poison, not the deprecated undef); the volatile
; load presumably exists only to produce an index the compiler cannot fold
; away.
2066 %idx.val = load volatile i32, ptr addrspace(1) poison
2067 %vec = load <4 x i16>, ptr addrspace(1) %in.gep
; Only the low 16 bits of the i32 argument are inserted. The bitcast below has
; identical source and destination types, so it leaves the value unchanged.
2068 %val.trunc = trunc i32 %val to i16
2069 %val.cvt = bitcast i16 %val.trunc to i16
2070 %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
2071 store <4 x i16> %vecins, ptr addrspace(1) %out.gep
; Inserts the low 16 bits of the scalar kernel argument %val (reinterpreted as
; half) into a <4 x half> at the run-time index %idxval. Each work-item loads
; its vector from %in and stores the result to %out.
; %idxval is itself a kernel argument, so the checks load it together with
; %val into s[4:5] and build the mask in scalar registers: s_lshl_b32 by 4,
; then s_lshl_b64 of 0xffff. %val is splatted into both halves of a dword and
; merged into both loaded dwords with v_bfi_b32.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
2075 define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 {
2076 ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2078 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2079 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
2080 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
2081 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2082 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
2083 ; GFX9-NEXT: s_lshl_b32 s2, s5, 4
2084 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4
2085 ; GFX9-NEXT: s_lshl_b64 s[2:3], 0xffff, s2
2086 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
2087 ; GFX9-NEXT: v_mov_b32_e32 v4, s4
2088 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2089 ; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1
2090 ; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0
2091 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
2092 ; GFX9-NEXT: s_endpgm
2094 ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2096 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2097 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
2098 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
2099 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2100 ; VI-NEXT: v_mov_b32_e32 v1, s3
2101 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2102 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2103 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2104 ; VI-NEXT: v_mov_b32_e32 v3, s1
2105 ; VI-NEXT: s_lshl_b32 s1, s4, 16
2106 ; VI-NEXT: s_and_b32 s2, s4, 0xffff
2107 ; VI-NEXT: s_lshl_b32 s3, s5, 4
2108 ; VI-NEXT: s_or_b32 s2, s2, s1
2109 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
2110 ; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s3
2111 ; VI-NEXT: v_mov_b32_e32 v4, s2
2112 ; VI-NEXT: v_mov_b32_e32 v5, s2
2113 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2114 ; VI-NEXT: s_waitcnt vmcnt(0)
2115 ; VI-NEXT: v_bfi_b32 v1, s1, v4, v1
2116 ; VI-NEXT: v_bfi_b32 v0, s0, v5, v0
2117 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2120 ; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2122 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2123 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
2124 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
2125 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2126 ; CI-NEXT: v_mov_b32_e32 v1, s3
2127 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
2128 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2129 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2130 ; CI-NEXT: v_mov_b32_e32 v3, s1
2131 ; CI-NEXT: s_and_b32 s1, s4, 0xffff
2132 ; CI-NEXT: s_lshl_b32 s2, s4, 16
2133 ; CI-NEXT: s_lshl_b32 s3, s5, 4
2134 ; CI-NEXT: s_or_b32 s2, s1, s2
2135 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
2136 ; CI-NEXT: s_lshl_b64 s[0:1], 0xffff, s3
2137 ; CI-NEXT: v_mov_b32_e32 v4, s2
2138 ; CI-NEXT: v_mov_b32_e32 v5, s2
2139 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2140 ; CI-NEXT: s_waitcnt vmcnt(0)
2141 ; CI-NEXT: v_bfi_b32 v1, s1, v4, v1
2142 ; CI-NEXT: v_bfi_b32 v0, s0, v5, v0
2143 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2146 ; GFX11-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2148 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
2149 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2150 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
2151 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2152 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
2153 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2154 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
2155 ; GFX11-NEXT: s_lshl_b32 s2, s5, 4
2156 ; GFX11-NEXT: s_pack_ll_b32_b16 s4, s4, s4
2157 ; GFX11-NEXT: s_lshl_b64 s[2:3], 0xffff, s2
2158 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2159 ; GFX11-NEXT: v_bfi_b32 v1, s3, s4, v1
2160 ; GFX11-NEXT: v_bfi_b32 v0, s2, s4, v0
2161 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
2162 ; GFX11-NEXT: s_endpgm
; The work-item id indexes both %in and %out, so every lane handles its own
; 8-byte vector.
2163 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2164 %tid.ext = sext i32 %tid to i64
2165 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
2166 %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
2167 %vec = load <4 x half>, ptr addrspace(1) %in.gep
; Only the low 16 bits of the i32 argument are inserted; the bitcast changes
; the type to half without changing the bits.
2168 %val.trunc = trunc i32 %val to i16
2169 %val.cvt = bitcast i16 %val.trunc to half
2170 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
2171 store <4 x half> %vecins, ptr addrspace(1) %out.gep
; Inserts the low 16 bits of the scalar kernel argument %val (reinterpreted as
; half) into element 3 of an <8 x half> that each work-item loads from %in,
; and stores the updated vector to %out.
; The vector is loaded as four dwords (v[0:3]). Element 3 is the high half of
; the second dword, so the expected code rewrites only v1 - with v_perm_b32 on
; GFX9 and GFX11, v_or_b32_sdwa on VI, and an s_lshl_b32 / v_and_b32 /
; v_or_b32 sequence on CI.
; Unlike the <4 x half> tests above, this define carries no #0 attribute
; group.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
2175 define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
2176 ; GFX9-LABEL: v_insertelement_v8f16_3:
2178 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2179 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
2180 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2181 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x5040100
2182 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2183 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
2184 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2185 ; GFX9-NEXT: v_perm_b32 v1, s4, v1, v5
2186 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
2187 ; GFX9-NEXT: s_endpgm
2189 ; VI-LABEL: v_insertelement_v8f16_3:
2191 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2192 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
2193 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2194 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2195 ; VI-NEXT: v_mov_b32_e32 v1, s3
2196 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
2197 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2198 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2199 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
2200 ; VI-NEXT: s_lshl_b32 s0, s4, 16
2201 ; VI-NEXT: v_mov_b32_e32 v5, s1
2202 ; VI-NEXT: v_mov_b32_e32 v6, s0
2203 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2204 ; VI-NEXT: s_waitcnt vmcnt(0)
2205 ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2206 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2209 ; CI-LABEL: v_insertelement_v8f16_3:
2211 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2212 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
2213 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2214 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2215 ; CI-NEXT: v_mov_b32_e32 v1, s3
2216 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
2217 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2218 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2219 ; CI-NEXT: v_mov_b32_e32 v5, s1
2220 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
2221 ; CI-NEXT: s_lshl_b32 s0, s4, 16
2222 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2223 ; CI-NEXT: s_waitcnt vmcnt(0)
2224 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
2225 ; CI-NEXT: v_or_b32_e32 v1, s0, v1
2226 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2229 ; GFX11-LABEL: v_insertelement_v8f16_3:
2231 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
2232 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2233 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
2234 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2235 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2236 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2237 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
2238 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2239 ; GFX11-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
2240 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
2241 ; GFX11-NEXT: s_endpgm
; The work-item id indexes both %in and %out, so every lane handles its own
; 16-byte vector.
2242 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2243 %tid.ext = sext i32 %tid to i64
2244 %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
2245 %out.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %out, i64 %tid.ext
2246 %vec = load <8 x half>, ptr addrspace(1) %in.gep
; Only the low 16 bits of the i32 argument are inserted; the bitcast changes
; the type to half without changing the bits.
2247 %val.trunc = trunc i32 %val to i16
2248 %val.cvt = bitcast i16 %val.trunc to half
2249 %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 3
2250 store <8 x half> %vecins, ptr addrspace(1) %out.gep
; Integer variant: inserts the low 16 bits of the scalar kernel argument %val
; into element 6 of an <8 x i16> that each work-item loads from %in, and
; stores the updated vector to %out.
; The vector is loaded as four dwords (v[0:3]). Element 6 is the low half of
; the fourth dword, so the expected code on all four targets rewrites only v3,
; using v_bfi_b32 with a 0xffff mask.
; Unlike the <4 x i16> tests above, this define carries no #0 attribute group.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
2254 define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
2255 ; GFX9-LABEL: v_insertelement_v8i16_6:
2257 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2258 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
2259 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2261 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
2262 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
2263 ; GFX9-NEXT: v_mov_b32_e32 v5, s4
2264 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2265 ; GFX9-NEXT: v_bfi_b32 v3, s2, v5, v3
2266 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
2267 ; GFX9-NEXT: s_endpgm
2269 ; VI-LABEL: v_insertelement_v8i16_6:
2271 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2272 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
2273 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2274 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2275 ; VI-NEXT: v_mov_b32_e32 v1, s3
2276 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
2277 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2278 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2279 ; VI-NEXT: v_mov_b32_e32 v5, s1
2280 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
2281 ; VI-NEXT: s_mov_b32 s0, 0xffff
2282 ; VI-NEXT: v_mov_b32_e32 v6, s4
2283 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2284 ; VI-NEXT: s_waitcnt vmcnt(0)
2285 ; VI-NEXT: v_bfi_b32 v3, s0, v6, v3
2286 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2289 ; CI-LABEL: v_insertelement_v8i16_6:
2291 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2292 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
2293 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2294 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2295 ; CI-NEXT: v_mov_b32_e32 v1, s3
2296 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
2297 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2298 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2299 ; CI-NEXT: v_mov_b32_e32 v5, s1
2300 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
2301 ; CI-NEXT: s_mov_b32 s0, 0xffff
2302 ; CI-NEXT: v_mov_b32_e32 v6, s4
2303 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2304 ; CI-NEXT: s_waitcnt vmcnt(0)
2305 ; CI-NEXT: v_bfi_b32 v3, s0, v6, v3
2306 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2309 ; GFX11-LABEL: v_insertelement_v8i16_6:
2311 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
2312 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2313 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
2314 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2315 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2316 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2317 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
2318 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2319 ; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
2320 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
2321 ; GFX11-NEXT: s_endpgm
; The work-item id indexes both %in and %out, so every lane handles its own
; 16-byte vector.
2322 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2323 %tid.ext = sext i32 %tid to i64
2324 %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2325 %out.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2326 %vec = load <8 x i16>, ptr addrspace(1) %in.gep
; Only the low 16 bits of the i32 argument are inserted. The bitcast below has
; identical source and destination types, so it leaves the value unchanged
; (presumably kept so the body has the same shape as the half tests).
2327 %val.trunc = trunc i32 %val to i16
2328 %val.cvt = bitcast i16 %val.trunc to i16
2329 %vecins = insertelement <8 x i16> %vec, i16 %val.cvt, i32 6
2330 store <8 x i16> %vecins, ptr addrspace(1) %out.gep
2334 define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
2335 ; GFX9-LABEL: v_insertelement_v8f16_dynamic:
2337 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2338 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
2339 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2340 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2341 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
2342 ; GFX9-NEXT: s_cmp_eq_u32 s5, 6
2343 ; GFX9-NEXT: v_mov_b32_e32 v5, s4
2344 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2345 ; GFX9-NEXT: s_cmp_eq_u32 s5, 7
2346 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100
2347 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2348 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc
2349 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2350 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2351 ; GFX9-NEXT: s_cmp_eq_u32 s5, 4
2352 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
2353 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2354 ; GFX9-NEXT: s_cmp_eq_u32 s5, 5
2355 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2
2356 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
2357 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2358 ; GFX9-NEXT: s_cmp_eq_u32 s5, 2
2359 ; GFX9-NEXT: v_perm_b32 v3, v3, v6, s2
2360 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc
2361 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2362 ; GFX9-NEXT: s_cmp_eq_u32 s5, 3
2363 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1
2364 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
2365 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2366 ; GFX9-NEXT: s_cmp_eq_u32 s5, 0
2367 ; GFX9-NEXT: v_perm_b32 v2, v6, v2, s2
2368 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc
2369 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2370 ; GFX9-NEXT: s_cmp_eq_u32 s5, 1
2371 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0
2372 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
2373 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2374 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
2375 ; GFX9-NEXT: v_perm_b32 v1, v6, v1, s2
2376 ; GFX9-NEXT: v_perm_b32 v0, v5, v0, s2
2377 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
2378 ; GFX9-NEXT: s_endpgm
2380 ; VI-LABEL: v_insertelement_v8f16_dynamic:
2382 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2383 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
2384 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2385 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2386 ; VI-NEXT: v_mov_b32_e32 v1, s3
2387 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
2388 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2389 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2390 ; VI-NEXT: v_mov_b32_e32 v5, s1
2391 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
2392 ; VI-NEXT: s_cmp_eq_u32 s5, 6
2393 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2394 ; VI-NEXT: v_mov_b32_e32 v6, s4
2395 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2396 ; VI-NEXT: s_cmp_eq_u32 s5, 7
2397 ; VI-NEXT: s_waitcnt vmcnt(0)
2398 ; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
2399 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2400 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2401 ; VI-NEXT: s_cmp_eq_u32 s5, 4
2402 ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
2403 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2404 ; VI-NEXT: s_cmp_eq_u32 s5, 5
2405 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
2406 ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
2407 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2408 ; VI-NEXT: s_cmp_eq_u32 s5, 2
2409 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2410 ; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
2411 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2412 ; VI-NEXT: s_cmp_eq_u32 s5, 3
2413 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
2414 ; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2415 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
2416 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
2417 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2418 ; VI-NEXT: s_cmp_eq_u32 s5, 0
2419 ; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2420 ; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc
2421 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2422 ; VI-NEXT: s_cmp_eq_u32 s5, 1
2423 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
2424 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
2425 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2426 ; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
2427 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
2428 ; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
2429 ; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2430 ; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2431 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2434 ; CI-LABEL: v_insertelement_v8f16_dynamic:
2436 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2437 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
2438 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2439 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2440 ; CI-NEXT: v_mov_b32_e32 v1, s3
2441 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
2442 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2443 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2444 ; CI-NEXT: v_mov_b32_e32 v5, s1
2445 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
2446 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s4
2447 ; CI-NEXT: s_cmp_eq_u32 s5, 7
2448 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2449 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2450 ; CI-NEXT: s_cmp_eq_u32 s5, 6
2451 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
2452 ; CI-NEXT: s_cmp_eq_u32 s5, 5
2453 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0
2454 ; CI-NEXT: s_cmp_eq_u32 s5, 4
2455 ; CI-NEXT: s_waitcnt vmcnt(0)
2456 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
2457 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
2458 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
2459 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
2460 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
2461 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
2462 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
2463 ; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
2464 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
2465 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
2466 ; CI-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2467 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
2468 ; CI-NEXT: s_cmp_eq_u32 s5, 3
2469 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
2470 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2471 ; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
2472 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2473 ; CI-NEXT: s_cmp_eq_u32 s5, 2
2474 ; CI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc
2475 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2476 ; CI-NEXT: s_cmp_eq_u32 s5, 1
2477 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
2478 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2479 ; CI-NEXT: s_cmp_eq_u32 s5, 0
2480 ; CI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3]
2481 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
2482 ; CI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc
2483 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2484 ; CI-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2485 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
2486 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
2487 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
2488 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
2489 ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
2490 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2491 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2492 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2493 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7
2494 ; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
2495 ; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9
2496 ; CI-NEXT: v_or_b32_e32 v3, v3, v6
2497 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v10
2498 ; CI-NEXT: v_or_b32_e32 v2, v2, v7
2499 ; CI-NEXT: v_or_b32_e32 v1, v1, v8
2500 ; CI-NEXT: v_or_b32_e32 v0, v0, v6
2501 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2504 ; GFX11-LABEL: v_insertelement_v8f16_dynamic:
2506 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
2507 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2508 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
2509 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2510 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2511 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2512 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
2513 ; GFX11-NEXT: s_cmp_eq_u32 s5, 6
2514 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
2515 ; GFX11-NEXT: s_cmp_eq_u32 s5, 7
2516 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2517 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, s4, s2
2518 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
2519 ; GFX11-NEXT: s_cmp_eq_u32 s5, 4
2520 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2521 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0
2522 ; GFX11-NEXT: s_cmp_eq_u32 s5, 5
2523 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
2524 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3
2525 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0
2526 ; GFX11-NEXT: s_cmp_eq_u32 s5, 2
2527 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
2528 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2
2529 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
2530 ; GFX11-NEXT: s_cmp_eq_u32 s5, 3
2531 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2
2532 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
2533 ; GFX11-NEXT: s_cmp_eq_u32 s5, 0
2534 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0
2535 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2
2536 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
2537 ; GFX11-NEXT: s_cmp_eq_u32 s5, 1
2538 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2
2539 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
2540 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, s3
2541 ; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s4, s2
2542 ; GFX11-NEXT: v_perm_b32 v3, v3, v5, 0x5040100
2543 ; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100
2544 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2545 ; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100
2546 ; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
2547 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
2548 ; GFX11-NEXT: s_endpgm
2549 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2550 %tid.ext = sext i32 %tid to i64
2551 %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
2552 %out.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %out, i64 %tid.ext
2553 %vec = load <8 x half>, ptr addrspace(1) %in.gep
2554 %val.trunc = trunc i32 %val to i16
2555 %val.cvt = bitcast i16 %val.trunc to half
2556 %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 %n
2557 store <8 x half> %vecins, ptr addrspace(1) %out.gep
; Constant-index insert into a <16 x half> vector.
; Each work-item loads a <16 x half> from %in (slot chosen by the workitem id),
; replaces element 3 with the low 16 bits of %val reinterpreted as half, and
; stores the result to the same slot of %out.
; Element 3 is the upper 16 bits of the second dword, so the expected code only
; rewrites v1 (v_perm_b32 on gfx900 and gfx1100, an SDWA v_or_b32 on fiji,
; v_and_b32 followed by v_or_b32 on hawaii); the remaining dwords of both
; 128-bit halves are stored back as loaded.
2561 define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
2562 ; GFX9-LABEL: v_insertelement_v16f16_3:
2564 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2565 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
2566 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2567 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x5040100
2568 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2569 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
2570 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
2571 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2572 ; GFX9-NEXT: v_perm_b32 v1, s4, v1, v9
2573 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2574 ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
2575 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
2576 ; GFX9-NEXT: s_endpgm
2578 ; VI-LABEL: v_insertelement_v16f16_3:
2580 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2581 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
2582 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2583 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2584 ; VI-NEXT: v_mov_b32_e32 v1, s3
2585 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
2586 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2587 ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
2588 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2589 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2590 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2591 ; VI-NEXT: v_mov_b32_e32 v9, s1
2592 ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
2593 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
2594 ; VI-NEXT: s_lshl_b32 s1, s4, 16
2595 ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
2596 ; VI-NEXT: v_mov_b32_e32 v12, s1
2597 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
2598 ; VI-NEXT: s_waitcnt vmcnt(1)
2599 ; VI-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2600 ; VI-NEXT: s_waitcnt vmcnt(0)
2601 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
2602 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
2605 ; CI-LABEL: v_insertelement_v16f16_3:
2607 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2608 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
2609 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2610 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2611 ; CI-NEXT: v_mov_b32_e32 v0, s3
2612 ; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8
2613 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
2614 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
2615 ; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4
2616 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2617 ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2618 ; CI-NEXT: v_mov_b32_e32 v9, s1
2619 ; CI-NEXT: v_add_i32_e32 v8, vcc, s0, v8
2620 ; CI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
2621 ; CI-NEXT: v_add_i32_e32 v10, vcc, 16, v8
2622 ; CI-NEXT: s_lshl_b32 s1, s4, 16
2623 ; CI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
2624 ; CI-NEXT: s_waitcnt vmcnt(1)
2625 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
2626 ; CI-NEXT: v_or_b32_e32 v1, s1, v1
2627 ; CI-NEXT: s_waitcnt vmcnt(0)
2628 ; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
2629 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
2632 ; GFX11-LABEL: v_insertelement_v16f16_3:
2634 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
2635 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2636 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
2637 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2638 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2639 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2640 ; GFX11-NEXT: s_clause 0x1
2641 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3]
2642 ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
2643 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2644 ; GFX11-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
2645 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2646 ; GFX11-NEXT: s_clause 0x1
2647 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
2648 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
2649 ; GFX11-NEXT: s_endpgm
; The sign-extended workitem id selects the <16 x half> slot in both buffers.
2650 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2651 %tid.ext = sext i32 %tid to i64
2652 %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
2653 %out.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %out, i64 %tid.ext
2654 %vec = load <16 x half>, ptr addrspace(1) %in.gep
; Only the low 16 bits of %val are used; they are reinterpreted as a half.
2655 %val.trunc = trunc i32 %val to i16
2656 %val.cvt = bitcast i16 %val.trunc to half
; Compile-time constant lane index 3.
2657 %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 3
2658 store <16 x half> %vecins, ptr addrspace(1) %out.gep
; Constant-index insert into a <16 x i16> vector.
; Each work-item loads a <16 x i16> from %in (slot chosen by the workitem id),
; replaces element 6 with the low 16 bits of %val, and stores the result to the
; same slot of %out.
; Element 6 is the lower 16 bits of the fourth dword, so the expected code only
; rewrites v3 (v_bfi_b32 with a 0xffff mask on gfx900, hawaii and gfx1100;
; v_perm_b32 with selector 0x3020504 on fiji); the remaining dwords of both
; 128-bit halves are stored back as loaded.
2662 define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
2663 ; GFX9-LABEL: v_insertelement_v16i16_6:
2665 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2666 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
2667 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2668 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2669 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
2670 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
2671 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
2672 ; GFX9-NEXT: v_mov_b32_e32 v9, s4
2673 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2674 ; GFX9-NEXT: v_bfi_b32 v3, s2, v9, v3
2675 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2676 ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
2677 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
2678 ; GFX9-NEXT: s_endpgm
2680 ; VI-LABEL: v_insertelement_v16i16_6:
2682 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2683 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
2684 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2685 ; VI-NEXT: v_mov_b32_e32 v12, 0x3020504
2686 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2687 ; VI-NEXT: v_mov_b32_e32 v1, s3
2688 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
2689 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2690 ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
2691 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2692 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2693 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2694 ; VI-NEXT: v_mov_b32_e32 v9, s1
2695 ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
2696 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
2697 ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
2698 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
2699 ; VI-NEXT: s_waitcnt vmcnt(1)
2700 ; VI-NEXT: v_perm_b32 v3, s4, v3, v12
2701 ; VI-NEXT: s_waitcnt vmcnt(0)
2702 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
2703 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
2706 ; CI-LABEL: v_insertelement_v16i16_6:
2708 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2709 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
2710 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2711 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2712 ; CI-NEXT: v_mov_b32_e32 v1, s3
2713 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v8
2714 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2715 ; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v0
2716 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2717 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2718 ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2719 ; CI-NEXT: v_mov_b32_e32 v9, s1
2720 ; CI-NEXT: v_add_i32_e32 v8, vcc, s0, v8
2721 ; CI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
2722 ; CI-NEXT: v_add_i32_e32 v10, vcc, 16, v8
2723 ; CI-NEXT: s_mov_b32 s2, 0xffff
2724 ; CI-NEXT: v_mov_b32_e32 v12, s4
2725 ; CI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
2726 ; CI-NEXT: s_waitcnt vmcnt(1)
2727 ; CI-NEXT: v_bfi_b32 v3, s2, v12, v3
2728 ; CI-NEXT: s_waitcnt vmcnt(0)
2729 ; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
2730 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
2733 ; GFX11-LABEL: v_insertelement_v16i16_6:
2735 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
2736 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2737 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
2738 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2739 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2740 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2741 ; GFX11-NEXT: s_clause 0x1
2742 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3]
2743 ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
2744 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2745 ; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
2746 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2747 ; GFX11-NEXT: s_clause 0x1
2748 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
2749 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
2750 ; GFX11-NEXT: s_endpgm
; The sign-extended workitem id selects the <16 x i16> slot in both buffers.
2751 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2752 %tid.ext = sext i32 %tid to i64
2753 %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2754 %out.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2755 %vec = load <16 x i16>, ptr addrspace(1) %in.gep
; Only the low 16 bits of %val are inserted.
2756 %val.trunc = trunc i32 %val to i16
; NOTE(review): this bitcast is i16 -> i16, i.e. an identity cast; presumably
; kept so the body parallels the half-typed tests - confirm before removing.
2757 %val.cvt = bitcast i16 %val.trunc to i16
; Compile-time constant lane index 6.
2758 %vecins = insertelement <16 x i16> %vec, i16 %val.cvt, i32 6
2759 store <16 x i16> %vecins, ptr addrspace(1) %out.gep
; Runtime-index insert into a <16 x half> vector.
; Each work-item loads a <16 x half> from %in (slot chosen by the workitem id),
; replaces element %n with the low 16 bits of %val reinterpreted as half, and
; stores the result to the same slot of %out.
; Because %n is not a compile-time constant, the expected code compares the
; index register (s5 in the checks) against every lane number 0..15 with
; s_cmp_eq_u32, selects per 16-bit lane between the loaded element and the new
; value with v_cndmask_b32, and then repacks the lanes into dwords
; (v_perm_b32 on gfx900 and gfx1100, shift plus SDWA v_or_b32 on fiji, shift
; plus v_or_b32 on hawaii). The hawaii sequence additionally converts every
; half to f32 before the select and back to f16 afterwards
; (v_cvt_f32_f16 / v_cvt_f16_f32).
2763 define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
2764 ; GFX9-LABEL: v_insertelement_v16f16_dynamic:
2766 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2767 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
2768 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0
2769 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2770 ; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3]
2771 ; GFX9-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16
2772 ; GFX9-NEXT: s_cmp_eq_u32 s5, 6
2773 ; GFX9-NEXT: v_mov_b32_e32 v9, s4
2774 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2775 ; GFX9-NEXT: s_cmp_eq_u32 s5, 7
2776 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100
2777 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2778 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc
2779 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
2780 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2781 ; GFX9-NEXT: s_cmp_eq_u32 s5, 4
2782 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
2783 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2784 ; GFX9-NEXT: s_cmp_eq_u32 s5, 5
2785 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3
2786 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
2787 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2788 ; GFX9-NEXT: s_cmp_eq_u32 s5, 2
2789 ; GFX9-NEXT: v_perm_b32 v4, v4, v10, s2
2790 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc
2791 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2792 ; GFX9-NEXT: s_cmp_eq_u32 s5, 3
2793 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2
2794 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc
2795 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2796 ; GFX9-NEXT: s_cmp_eq_u32 s5, 0
2797 ; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2
2798 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc
2799 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2800 ; GFX9-NEXT: s_cmp_eq_u32 s5, 1
2801 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1
2802 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
2803 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2804 ; GFX9-NEXT: s_cmp_eq_u32 s5, 14
2805 ; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2
2806 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc
2807 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2808 ; GFX9-NEXT: s_cmp_eq_u32 s5, 15
2809 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2810 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v8
2811 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
2812 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2813 ; GFX9-NEXT: s_cmp_eq_u32 s5, 12
2814 ; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2
2815 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc
2816 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2817 ; GFX9-NEXT: s_cmp_eq_u32 s5, 13
2818 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
2819 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
2820 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2821 ; GFX9-NEXT: s_cmp_eq_u32 s5, 10
2822 ; GFX9-NEXT: v_perm_b32 v8, v10, v8, s2
2823 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc
2824 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2825 ; GFX9-NEXT: s_cmp_eq_u32 s5, 11
2826 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6
2827 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
2828 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2829 ; GFX9-NEXT: s_cmp_eq_u32 s5, 8
2830 ; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2
2831 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc
2832 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2833 ; GFX9-NEXT: s_cmp_eq_u32 s5, 9
2834 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5
2835 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
2836 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2837 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc
2838 ; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2
2839 ; GFX9-NEXT: v_perm_b32 v5, v9, v5, s2
2840 ; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
2841 ; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1]
2842 ; GFX9-NEXT: s_endpgm
2844 ; VI-LABEL: v_insertelement_v16f16_dynamic:
2846 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2847 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
2848 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2849 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2850 ; VI-NEXT: v_mov_b32_e32 v0, s3
2851 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8
2852 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
2853 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4
2854 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
2855 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2856 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2857 ; VI-NEXT: v_mov_b32_e32 v9, s1
2858 ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
2859 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
2860 ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
2861 ; VI-NEXT: s_cmp_eq_u32 s5, 14
2862 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
2863 ; VI-NEXT: v_mov_b32_e32 v12, s4
2864 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2865 ; VI-NEXT: s_cmp_eq_u32 s5, 15
2866 ; VI-NEXT: s_waitcnt vmcnt(1)
2867 ; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc
2868 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2869 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2870 ; VI-NEXT: s_cmp_eq_u32 s5, 12
2871 ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
2872 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2873 ; VI-NEXT: s_cmp_eq_u32 s5, 13
2874 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
2875 ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
2876 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2877 ; VI-NEXT: s_cmp_eq_u32 s5, 10
2878 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2879 ; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc
2880 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2881 ; VI-NEXT: s_cmp_eq_u32 s5, 11
2882 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1
2883 ; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2884 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14
2885 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
2886 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2887 ; VI-NEXT: s_cmp_eq_u32 s5, 8
2888 ; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2889 ; VI-NEXT: v_cndmask_b32_e32 v13, v15, v12, vcc
2890 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2891 ; VI-NEXT: s_cmp_eq_u32 s5, 9
2892 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
2893 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
2894 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
2895 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2896 ; VI-NEXT: s_cmp_eq_u32 s5, 6
2897 ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2898 ; VI-NEXT: v_cndmask_b32_e32 v13, v16, v12, vcc
2899 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2900 ; VI-NEXT: s_cmp_eq_u32 s5, 7
2901 ; VI-NEXT: s_waitcnt vmcnt(0)
2902 ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7
2903 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
2904 ; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
2905 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2906 ; VI-NEXT: s_cmp_eq_u32 s5, 4
2907 ; VI-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2908 ; VI-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc
2909 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2910 ; VI-NEXT: s_cmp_eq_u32 s5, 5
2911 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6
2912 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
2913 ; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
2914 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2915 ; VI-NEXT: s_cmp_eq_u32 s5, 2
2916 ; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2917 ; VI-NEXT: v_cndmask_b32_e32 v13, v18, v12, vcc
2918 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2919 ; VI-NEXT: s_cmp_eq_u32 s5, 3
2920 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
2921 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
2922 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc
2923 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2924 ; VI-NEXT: s_cmp_eq_u32 s5, 0
2925 ; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2926 ; VI-NEXT: v_cndmask_b32_e32 v13, v19, v12, vcc
2927 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2928 ; VI-NEXT: s_cmp_eq_u32 s5, 1
2929 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4
2930 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
2931 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2932 ; VI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc
2933 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
2934 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
2935 ; VI-NEXT: v_or_b32_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2936 ; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2937 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
2938 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
2941 ; CI-LABEL: v_insertelement_v16f16_dynamic:
2943 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
2944 ; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
2945 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0
2946 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2947 ; CI-NEXT: v_mov_b32_e32 v1, s3
2948 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
2949 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2950 ; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
2951 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
2952 ; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3]
2953 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2954 ; CI-NEXT: v_mov_b32_e32 v5, s1
2955 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
2956 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s4
2957 ; CI-NEXT: s_cmp_eq_u32 s5, 15
2958 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2959 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2960 ; CI-NEXT: s_cmp_eq_u32 s5, 14
2961 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
2962 ; CI-NEXT: s_cmp_eq_u32 s5, 13
2963 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0
2964 ; CI-NEXT: s_cmp_eq_u32 s5, 12
2965 ; CI-NEXT: s_waitcnt vmcnt(1)
2966 ; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
2967 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
2968 ; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8
2969 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
2970 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
2971 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
2972 ; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7
2973 ; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
2974 ; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1]
2975 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
2976 ; CI-NEXT: s_cmp_eq_u32 s5, 11
2977 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
2978 ; CI-NEXT: s_waitcnt vmcnt(0)
2979 ; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
2980 ; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc
2981 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2982 ; CI-NEXT: s_cmp_eq_u32 s5, 10
2983 ; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
2984 ; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc
2985 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2986 ; CI-NEXT: s_cmp_eq_u32 s5, 9
2987 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
2988 ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
2989 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
2990 ; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
2991 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2992 ; CI-NEXT: s_cmp_eq_u32 s5, 8
2993 ; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
2994 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
2995 ; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
2996 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2997 ; CI-NEXT: s_cmp_eq_u32 s5, 7
2998 ; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9
2999 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
3000 ; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
3001 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3002 ; CI-NEXT: s_cmp_eq_u32 s5, 6
3003 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
3004 ; CI-NEXT: v_cndmask_b32_e32 v15, v15, v6, vcc
3005 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3006 ; CI-NEXT: s_cmp_eq_u32 s5, 5
3007 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
3008 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
3009 ; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
3010 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3011 ; CI-NEXT: s_cmp_eq_u32 s5, 4
3012 ; CI-NEXT: v_or_b32_e32 v10, v10, v11
3013 ; CI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc
3014 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3015 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
3016 ; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
3017 ; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3]
3018 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
3019 ; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1]
3020 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
3021 ; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1
3022 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
3023 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
3024 ; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
3025 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
3026 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
3027 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
3028 ; CI-NEXT: v_or_b32_e32 v2, v2, v11
3029 ; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
3030 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
3031 ; CI-NEXT: s_cmp_eq_u32 s5, 3
3032 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
3033 ; CI-NEXT: v_or_b32_e32 v9, v9, v12
3034 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14
3035 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3036 ; CI-NEXT: s_cmp_eq_u32 s5, 2
3037 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
3038 ; CI-NEXT: v_or_b32_e32 v7, v7, v12
3039 ; CI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc
3040 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3041 ; CI-NEXT: s_cmp_eq_u32 s5, 1
3042 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
3043 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3044 ; CI-NEXT: s_cmp_eq_u32 s5, 0
3045 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
3046 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
3047 ; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc
3048 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3049 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
3050 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
3051 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
3052 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
3053 ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
3054 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
3055 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
3056 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
3057 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12
3058 ; CI-NEXT: v_or_b32_e32 v8, v8, v13
3059 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v15
3060 ; CI-NEXT: v_or_b32_e32 v1, v1, v6
3061 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11
3062 ; CI-NEXT: v_or_b32_e32 v3, v3, v13
3063 ; CI-NEXT: v_or_b32_e32 v0, v0, v6
3064 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3066 ; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4
3067 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
3068 ; CI-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
3071 ; GFX11-LABEL: v_insertelement_v16f16_dynamic:
3073 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
3074 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
3075 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
3076 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3077 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0
3078 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3079 ; GFX11-NEXT: s_clause 0x1
3080 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3]
3081 ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
3082 ; GFX11-NEXT: s_cmp_eq_u32 s5, 6
3083 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3084 ; GFX11-NEXT: s_cmp_eq_u32 s5, 7
3085 ; GFX11-NEXT: s_waitcnt vmcnt(1)
3086 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v3, s4, s2
3087 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3088 ; GFX11-NEXT: s_cmp_eq_u32 s5, 4
3089 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3090 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0
3091 ; GFX11-NEXT: s_cmp_eq_u32 s5, 5
3092 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2
3093 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3
3094 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0
3095 ; GFX11-NEXT: s_cmp_eq_u32 s5, 2
3096 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1
3097 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2
3098 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3099 ; GFX11-NEXT: s_cmp_eq_u32 s5, 3
3100 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2
3101 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3102 ; GFX11-NEXT: s_cmp_eq_u32 s5, 0
3103 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v0
3104 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s4, s2
3105 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3106 ; GFX11-NEXT: s_cmp_eq_u32 s5, 1
3107 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2
3108 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3109 ; GFX11-NEXT: s_cmp_eq_u32 s5, 14
3110 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3111 ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v7
3112 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s4, s3
3113 ; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100
3114 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, s4, s2
3115 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3116 ; GFX11-NEXT: s_cmp_eq_u32 s5, 15
3117 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2
3118 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3119 ; GFX11-NEXT: s_cmp_eq_u32 s5, 12
3120 ; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6
3121 ; GFX11-NEXT: v_perm_b32 v2, v10, v2, 0x5040100
3122 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v13, s4, s2
3123 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3124 ; GFX11-NEXT: s_cmp_eq_u32 s5, 13
3125 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, s2
3126 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3127 ; GFX11-NEXT: s_cmp_eq_u32 s5, 10
3128 ; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v5
3129 ; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, s4, s2
3130 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3131 ; GFX11-NEXT: s_cmp_eq_u32 s5, 11
3132 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s4, s2
3133 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3134 ; GFX11-NEXT: s_cmp_eq_u32 s5, 8
3135 ; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4
3136 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v15, s4, s2
3137 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3138 ; GFX11-NEXT: s_cmp_eq_u32 s5, 9
3139 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s4, s2
3140 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3141 ; GFX11-NEXT: v_perm_b32 v7, v10, v7, 0x5040100
3142 ; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s4, s2
3143 ; GFX11-NEXT: v_perm_b32 v6, v12, v6, 0x5040100
3144 ; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100
3145 ; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100
3146 ; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100
3147 ; GFX11-NEXT: v_perm_b32 v4, v14, v4, 0x5040100
3148 ; GFX11-NEXT: s_clause 0x1
3149 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
3150 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
3151 ; GFX11-NEXT: s_endpgm
; The sign-extended workitem id selects the <16 x half> slot in both buffers.
3152 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
3153 %tid.ext = sext i32 %tid to i64
3154 %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
3155 %out.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %out, i64 %tid.ext
3156 %vec = load <16 x half>, ptr addrspace(1) %in.gep
; Only the low 16 bits of %val are used; they are reinterpreted as a half.
3157 %val.trunc = trunc i32 %val to i16
3158 %val.cvt = bitcast i16 %val.trunc to half
; Lane index comes from the kernel argument %n, so it is unknown at compile time.
3159 %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 %n
3160 store <16 x half> %vecins, ptr addrspace(1) %out.gep
; Declaration of the intrinsic the kernels call to obtain the i32 value they
; use as the per-work-item index into %in and %out.
3165 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 marks a function nounwind; group #1 (applied to the
; intrinsic declaration and its call sites) adds readnone.
3167 attributes #0 = { nounwind }
3168 attributes #1 = { nounwind readnone }