1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s
4 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s
5 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
; Scalar case, constant into element 0.
; Loads a <2 x i16> through the addrspace(4) pointer %vec.ptr, replaces
; element 0 with the constant 999 (0x3e7 in the checks) and stores the
; resulting vector to %out.
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - a single s_pack_lh_b32_b16 with 0x3e7 as first operand.
;   fiji / hawaii    - s_and_b32 with 0xffff0000 followed by s_or_b32 with 0x3e7.
7 define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
8 ; GFX9-LABEL: s_insertelement_v2i16_0:
10 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
12 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
13 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX9-NEXT: s_pack_lh_b32_b16 s2, 0x3e7, s2
16 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
17 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
20 ; CIVI-LABEL: s_insertelement_v2i16_0:
22 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
23 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
24 ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
25 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
26 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
27 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
28 ; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000
29 ; CIVI-NEXT: s_or_b32 s0, s0, 0x3e7
30 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
31 ; CIVI-NEXT: flat_store_dword v[0:1], v2
34 ; GFX11-LABEL: s_insertelement_v2i16_0:
36 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
37 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
38 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
39 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0x3e7, s2
41 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
42 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
43 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
45 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
46 ; GFX11-NEXT: s_endpgm
47 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
48 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
49 store <2 x i16> %vecins, ptr addrspace(1) %out
; Scalar case, i16 kernel argument into element 0.
; Same shape as the constant variant, but the inserted value is the i16
; argument %elt. An unnamed [8 x i32] argument sits between %vec.ptr and
; %elt; the checks load %elt with offset 0x30 (0xc in the hawaii checks).
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - a single s_pack_lh_b32_b16 of the %elt register and
;                      the loaded vector.
;   fiji / hawaii    - s_and_b32 0xffff on %elt, s_and_b32 0xffff0000 on the
;                      loaded vector, then s_or_b32 of the two.
54 define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
55 ; GFX9-LABEL: s_insertelement_v2i16_0_reg:
57 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
58 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
59 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
60 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
61 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
62 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
63 ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s6, s2
64 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
65 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
68 ; VI-LABEL: s_insertelement_v2i16_0_reg:
70 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
71 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
72 ; VI-NEXT: s_waitcnt lgkmcnt(0)
73 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
74 ; VI-NEXT: v_mov_b32_e32 v0, s0
75 ; VI-NEXT: v_mov_b32_e32 v1, s1
76 ; VI-NEXT: s_and_b32 s0, s4, 0xffff
77 ; VI-NEXT: s_waitcnt lgkmcnt(0)
78 ; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
79 ; VI-NEXT: s_or_b32 s0, s0, s1
80 ; VI-NEXT: v_mov_b32_e32 v2, s0
81 ; VI-NEXT: flat_store_dword v[0:1], v2
84 ; CI-LABEL: s_insertelement_v2i16_0_reg:
86 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
87 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
88 ; CI-NEXT: s_waitcnt lgkmcnt(0)
89 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
90 ; CI-NEXT: v_mov_b32_e32 v0, s0
91 ; CI-NEXT: v_mov_b32_e32 v1, s1
92 ; CI-NEXT: s_and_b32 s1, s4, 0xffff
93 ; CI-NEXT: s_waitcnt lgkmcnt(0)
94 ; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
95 ; CI-NEXT: s_or_b32 s0, s1, s0
96 ; CI-NEXT: v_mov_b32_e32 v2, s0
97 ; CI-NEXT: flat_store_dword v[0:1], v2
100 ; GFX11-LABEL: s_insertelement_v2i16_0_reg:
102 ; GFX11-NEXT: s_clause 0x1
103 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
104 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30
105 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
106 ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
107 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
108 ; GFX11-NEXT: s_pack_lh_b32_b16 s0, s0, s1
109 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
110 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
111 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
112 ; GFX11-NEXT: s_nop 0
113 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
114 ; GFX11-NEXT: s_endpgm
115 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
116 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
117 store <2 x i16> %vecins, ptr addrspace(1) %out
; Scalar case, i16 argument into element 0, with a second use of the
; untouched high element.
; Element 1 of the loaded vector is extracted, zero-extended to i32 and fed
; to an inline asm ("; use $0", constraint "s") after the store, so the high
; half must also exist as a separate value.
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - s_lshr_b32 by 16 on the loaded vector, then
;                      s_pack_ll_b32_b16 of %elt and the shifted value; the
;                      shifted register is the one printed by the asm.
;   fiji / hawaii    - s_and_b32 / s_and_b32 / s_or_b32 for the insert plus a
;                      separate s_lshr_b32 by 16 for the extra use.
121 define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
122 ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
124 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
125 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
126 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
127 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
128 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
129 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
130 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
131 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s2
132 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
133 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
134 ; GFX9-NEXT: ;;#ASMSTART
135 ; GFX9-NEXT: ; use s2
136 ; GFX9-NEXT: ;;#ASMEND
137 ; GFX9-NEXT: s_endpgm
139 ; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
141 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
142 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
143 ; VI-NEXT: s_waitcnt lgkmcnt(0)
144 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
145 ; VI-NEXT: v_mov_b32_e32 v0, s0
146 ; VI-NEXT: v_mov_b32_e32 v1, s1
147 ; VI-NEXT: s_and_b32 s0, s4, 0xffff
148 ; VI-NEXT: s_waitcnt lgkmcnt(0)
149 ; VI-NEXT: s_lshr_b32 s1, s2, 16
150 ; VI-NEXT: s_and_b32 s2, s2, 0xffff0000
151 ; VI-NEXT: s_or_b32 s0, s0, s2
152 ; VI-NEXT: v_mov_b32_e32 v2, s0
153 ; VI-NEXT: flat_store_dword v[0:1], v2
154 ; VI-NEXT: ;;#ASMSTART
159 ; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
161 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
162 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
163 ; CI-NEXT: s_waitcnt lgkmcnt(0)
164 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
165 ; CI-NEXT: v_mov_b32_e32 v0, s0
166 ; CI-NEXT: v_mov_b32_e32 v1, s1
167 ; CI-NEXT: s_and_b32 s0, s4, 0xffff
168 ; CI-NEXT: s_waitcnt lgkmcnt(0)
169 ; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
170 ; CI-NEXT: s_or_b32 s0, s0, s1
171 ; CI-NEXT: v_mov_b32_e32 v2, s0
172 ; CI-NEXT: s_lshr_b32 s2, s2, 16
173 ; CI-NEXT: flat_store_dword v[0:1], v2
174 ; CI-NEXT: ;;#ASMSTART
179 ; GFX11-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
181 ; GFX11-NEXT: s_clause 0x1
182 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
183 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30
184 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
185 ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
186 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
187 ; GFX11-NEXT: s_lshr_b32 s1, s1, 16
188 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
189 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
190 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
191 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
192 ; GFX11-NEXT: ;;#ASMSTART
193 ; GFX11-NEXT: ; use s1
194 ; GFX11-NEXT: ;;#ASMEND
195 ; GFX11-NEXT: s_nop 0
196 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
197 ; GFX11-NEXT: s_endpgm
198 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
199 %elt1 = extractelement <2 x i16> %vec, i32 1
200 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
201 store <2 x i16> %vecins, ptr addrspace(1) %out
202 %use1 = zext i16 %elt1 to i32
203 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
; Scalar case, high half of an i32 argument into element 0.
; The inserted i16 is produced by lshr 16 + trunc of %elt.arg, i.e. it is the
; upper 16 bits of that argument. %elt.arg follows an unnamed [8 x i32]
; argument and is loaded with offset 0x30 (0xc in the hawaii checks).
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - a single s_pack_hh_b32_b16 of the argument and the
;                      loaded vector, with no separate shift.
;   fiji / hawaii    - s_lshr_b32 by 16 on the loaded vector followed by
;                      v_alignbit_b32 with shift amount 16.
207 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
208 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
210 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
211 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
212 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
213 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
215 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
216 ; GFX9-NEXT: s_pack_hh_b32_b16 s2, s6, s2
217 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
218 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
219 ; GFX9-NEXT: s_endpgm
221 ; VI-LABEL: s_insertelement_v2i16_0_reghi:
223 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
224 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
225 ; VI-NEXT: s_waitcnt lgkmcnt(0)
226 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
227 ; VI-NEXT: v_mov_b32_e32 v0, s0
228 ; VI-NEXT: v_mov_b32_e32 v2, s4
229 ; VI-NEXT: v_mov_b32_e32 v1, s1
230 ; VI-NEXT: s_waitcnt lgkmcnt(0)
231 ; VI-NEXT: s_lshr_b32 s0, s2, 16
232 ; VI-NEXT: v_alignbit_b32 v2, s0, v2, 16
233 ; VI-NEXT: flat_store_dword v[0:1], v2
236 ; CI-LABEL: s_insertelement_v2i16_0_reghi:
238 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
239 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
240 ; CI-NEXT: s_waitcnt lgkmcnt(0)
241 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
242 ; CI-NEXT: v_mov_b32_e32 v0, s0
243 ; CI-NEXT: v_mov_b32_e32 v2, s4
244 ; CI-NEXT: v_mov_b32_e32 v1, s1
245 ; CI-NEXT: s_waitcnt lgkmcnt(0)
246 ; CI-NEXT: s_lshr_b32 s0, s2, 16
247 ; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16
248 ; CI-NEXT: flat_store_dword v[0:1], v2
251 ; GFX11-LABEL: s_insertelement_v2i16_0_reghi:
253 ; GFX11-NEXT: s_clause 0x1
254 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
255 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30
256 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
257 ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
258 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
259 ; GFX11-NEXT: s_pack_hh_b32_b16 s0, s0, s1
260 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
261 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
262 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
263 ; GFX11-NEXT: s_nop 0
264 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
265 ; GFX11-NEXT: s_endpgm
266 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
267 %elt.hi = lshr i32 %elt.arg, 16
268 %elt = trunc i32 %elt.hi to i16
269 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
270 store <2 x i16> %vecins, ptr addrspace(1) %out
; Scalar case, high half of an i32 argument into element 0, where the
; extracted high half has a second use.
; %elt (upper 16 bits of %elt.arg) is inserted into element 0 and is also
; zero-extended and passed to an inline asm ("; use $0", constraint "s").
; There is no padding argument here; the checks load %elt.arg with offset
; 0x10 (0x4 in the hawaii checks).
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - an explicit s_lshr_b32 by 16 on the argument, then
;                      s_pack_lh_b32_b16; the shifted register is the one
;                      printed by the asm.
;   fiji / hawaii    - s_lshr_b32 on both the argument and the loaded vector,
;                      with v_alignbit_b32 (shift 16) building the result.
274 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 {
275 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
277 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
278 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
279 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
280 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
281 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
282 ; GFX9-NEXT: s_lshr_b32 s3, s6, 16
283 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
284 ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2
285 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
286 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
287 ; GFX9-NEXT: ;;#ASMSTART
288 ; GFX9-NEXT: ; use s3
289 ; GFX9-NEXT: ;;#ASMEND
290 ; GFX9-NEXT: s_endpgm
292 ; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
294 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
295 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
296 ; VI-NEXT: s_waitcnt lgkmcnt(0)
297 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
298 ; VI-NEXT: v_mov_b32_e32 v1, s1
299 ; VI-NEXT: v_mov_b32_e32 v2, s4
300 ; VI-NEXT: v_mov_b32_e32 v0, s0
301 ; VI-NEXT: s_lshr_b32 s0, s4, 16
302 ; VI-NEXT: s_waitcnt lgkmcnt(0)
303 ; VI-NEXT: s_lshr_b32 s1, s2, 16
304 ; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
305 ; VI-NEXT: flat_store_dword v[0:1], v2
306 ; VI-NEXT: ;;#ASMSTART
311 ; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
313 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
314 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
315 ; CI-NEXT: s_waitcnt lgkmcnt(0)
316 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
317 ; CI-NEXT: v_mov_b32_e32 v1, s1
318 ; CI-NEXT: v_mov_b32_e32 v2, s4
319 ; CI-NEXT: v_mov_b32_e32 v0, s0
320 ; CI-NEXT: s_lshr_b32 s0, s4, 16
321 ; CI-NEXT: s_waitcnt lgkmcnt(0)
322 ; CI-NEXT: s_lshr_b32 s1, s2, 16
323 ; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16
324 ; CI-NEXT: flat_store_dword v[0:1], v2
325 ; CI-NEXT: ;;#ASMSTART
330 ; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
332 ; GFX11-NEXT: s_clause 0x1
333 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
334 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
335 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
336 ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
337 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16
338 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
339 ; GFX11-NEXT: s_pack_lh_b32_b16 s1, s0, s1
340 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
341 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1
342 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
343 ; GFX11-NEXT: ;;#ASMSTART
344 ; GFX11-NEXT: ; use s0
345 ; GFX11-NEXT: ;;#ASMEND
346 ; GFX11-NEXT: s_nop 0
347 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
348 ; GFX11-NEXT: s_endpgm
349 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
350 %elt.hi = lshr i32 %elt.arg, 16
351 %elt = trunc i32 %elt.hi to i16
352 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
353 store <2 x i16> %vecins, ptr addrspace(1) %out
354 %use1 = zext i16 %elt to i32
355 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
; Scalar case, high half of an i32 argument into element 0, where BOTH
; halves of the result have additional uses.
; %elt (upper 16 bits of %elt.arg) and %vec.hi (element 1 of the loaded
; vector) are each zero-extended and passed to their own inline asm
; ("; use $0", constraint "s") after the store.
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - s_lshr_b32 by 16 on the argument and on the loaded
;                      vector, then s_pack_ll_b32_b16 of the two shifted
;                      values; both shifted registers are printed by the asm
;                      blocks.
;   fiji / hawaii    - the same two s_lshr_b32, with v_alignbit_b32 (shift 16)
;                      building the stored value.
359 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %elt.arg) #0 {
360 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
362 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
363 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
364 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
365 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
366 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
367 ; GFX9-NEXT: s_lshr_b32 s3, s6, 16
368 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
369 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
370 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2
371 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
372 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
373 ; GFX9-NEXT: ;;#ASMSTART
374 ; GFX9-NEXT: ; use s3
375 ; GFX9-NEXT: ;;#ASMEND
376 ; GFX9-NEXT: ;;#ASMSTART
377 ; GFX9-NEXT: ; use s2
378 ; GFX9-NEXT: ;;#ASMEND
379 ; GFX9-NEXT: s_endpgm
381 ; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
383 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
384 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
385 ; VI-NEXT: s_waitcnt lgkmcnt(0)
386 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
387 ; VI-NEXT: v_mov_b32_e32 v1, s1
388 ; VI-NEXT: v_mov_b32_e32 v2, s4
389 ; VI-NEXT: v_mov_b32_e32 v0, s0
390 ; VI-NEXT: s_lshr_b32 s0, s4, 16
391 ; VI-NEXT: s_waitcnt lgkmcnt(0)
392 ; VI-NEXT: s_lshr_b32 s1, s2, 16
393 ; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
394 ; VI-NEXT: flat_store_dword v[0:1], v2
395 ; VI-NEXT: ;;#ASMSTART
398 ; VI-NEXT: ;;#ASMSTART
403 ; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
405 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
406 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
407 ; CI-NEXT: s_waitcnt lgkmcnt(0)
408 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
409 ; CI-NEXT: v_mov_b32_e32 v1, s1
410 ; CI-NEXT: v_mov_b32_e32 v2, s4
411 ; CI-NEXT: v_mov_b32_e32 v0, s0
412 ; CI-NEXT: s_lshr_b32 s0, s4, 16
413 ; CI-NEXT: s_waitcnt lgkmcnt(0)
414 ; CI-NEXT: s_lshr_b32 s1, s2, 16
415 ; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16
416 ; CI-NEXT: flat_store_dword v[0:1], v2
417 ; CI-NEXT: ;;#ASMSTART
420 ; CI-NEXT: ;;#ASMSTART
425 ; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
427 ; GFX11-NEXT: s_clause 0x1
428 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
429 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
430 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
431 ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
432 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16
433 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
434 ; GFX11-NEXT: s_lshr_b32 s1, s1, 16
435 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
436 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s1
437 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
438 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
439 ; GFX11-NEXT: ;;#ASMSTART
440 ; GFX11-NEXT: ; use s0
441 ; GFX11-NEXT: ;;#ASMEND
442 ; GFX11-NEXT: ;;#ASMSTART
443 ; GFX11-NEXT: ; use s1
444 ; GFX11-NEXT: ;;#ASMEND
445 ; GFX11-NEXT: s_nop 0
446 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
447 ; GFX11-NEXT: s_endpgm
448 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
449 %elt.hi = lshr i32 %elt.arg, 16
450 %elt = trunc i32 %elt.hi to i16
451 %vec.hi = extractelement <2 x i16> %vec, i32 1
452 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
453 store <2 x i16> %vecins, ptr addrspace(1) %out
454 %use1 = zext i16 %elt to i32
455 %vec.hi.use1 = zext i16 %vec.hi to i32
457 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
458 call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
; Scalar case, constant into element 1.
; Loads a <2 x i16> through the addrspace(4) pointer %vec.ptr, replaces
; element 1 with the constant 999 (0x3e7) and stores the result to %out.
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - a single s_pack_ll_b32_b16 with the loaded vector as
;                      first operand and 0x3e7 as second operand.
;   fiji / hawaii    - s_and_b32 with 0xffff followed by s_or_b32 with
;                      0x3e70000 (the constant already placed in the upper
;                      16 bits).
462 define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
463 ; GFX9-LABEL: s_insertelement_v2i16_1:
465 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
466 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
467 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
468 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
469 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
470 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x3e7
471 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
472 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
473 ; GFX9-NEXT: s_endpgm
475 ; CIVI-LABEL: s_insertelement_v2i16_1:
477 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
478 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
479 ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
480 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
481 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
482 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
483 ; CIVI-NEXT: s_and_b32 s0, s2, 0xffff
484 ; CIVI-NEXT: s_or_b32 s0, s0, 0x3e70000
485 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
486 ; CIVI-NEXT: flat_store_dword v[0:1], v2
487 ; CIVI-NEXT: s_endpgm
489 ; GFX11-LABEL: s_insertelement_v2i16_1:
491 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
492 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
493 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
494 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
495 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, 0x3e7
496 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
497 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
498 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
499 ; GFX11-NEXT: s_nop 0
500 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
501 ; GFX11-NEXT: s_endpgm
502 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
503 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
504 store <2 x i16> %vecins, ptr addrspace(1) %out
; Scalar case, i16 kernel argument into element 1.
; %elt follows an unnamed [8 x i32] argument and is loaded with offset 0x30
; (0xc in the hawaii checks); it replaces element 1 of the vector loaded
; from %vec.ptr.
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - a single s_pack_ll_b32_b16 with the loaded vector as
;                      first operand and the %elt register as second operand.
;   fiji / hawaii    - s_lshl_b32 by 16 on %elt, s_and_b32 0xffff on the
;                      loaded vector, then s_or_b32 of the two.
508 define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i16 %elt) #0 {
509 ; GFX9-LABEL: s_insertelement_v2i16_1_reg:
511 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
512 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
513 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
514 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
515 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
516 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
517 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6
518 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
519 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
520 ; GFX9-NEXT: s_endpgm
522 ; VI-LABEL: s_insertelement_v2i16_1_reg:
524 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
525 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
526 ; VI-NEXT: s_waitcnt lgkmcnt(0)
527 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
528 ; VI-NEXT: v_mov_b32_e32 v0, s0
529 ; VI-NEXT: v_mov_b32_e32 v1, s1
530 ; VI-NEXT: s_lshl_b32 s0, s4, 16
531 ; VI-NEXT: s_waitcnt lgkmcnt(0)
532 ; VI-NEXT: s_and_b32 s1, s2, 0xffff
533 ; VI-NEXT: s_or_b32 s0, s1, s0
534 ; VI-NEXT: v_mov_b32_e32 v2, s0
535 ; VI-NEXT: flat_store_dword v[0:1], v2
538 ; CI-LABEL: s_insertelement_v2i16_1_reg:
540 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
541 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
542 ; CI-NEXT: s_waitcnt lgkmcnt(0)
543 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
544 ; CI-NEXT: v_mov_b32_e32 v0, s0
545 ; CI-NEXT: v_mov_b32_e32 v1, s1
546 ; CI-NEXT: s_lshl_b32 s1, s4, 16
547 ; CI-NEXT: s_waitcnt lgkmcnt(0)
548 ; CI-NEXT: s_and_b32 s0, s2, 0xffff
549 ; CI-NEXT: s_or_b32 s0, s0, s1
550 ; CI-NEXT: v_mov_b32_e32 v2, s0
551 ; CI-NEXT: flat_store_dword v[0:1], v2
554 ; GFX11-LABEL: s_insertelement_v2i16_1_reg:
556 ; GFX11-NEXT: s_clause 0x1
557 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
558 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30
559 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
560 ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
561 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
562 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s1, s0
563 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
564 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
565 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
566 ; GFX11-NEXT: s_nop 0
567 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
568 ; GFX11-NEXT: s_endpgm
569 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
570 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
571 store <2 x i16> %vecins, ptr addrspace(1) %out
; Scalar case, half constant into element 0 of a <2 x half>.
; Replaces element 0 of the vector loaded from %vec.ptr with half 5.0, which
; appears as the immediate 0x4500 in the checks, and stores the result.
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - s_lshr_b32 by 16 on the loaded vector followed by
;                      s_pack_ll_b32_b16 with 0x4500 as first operand (two
;                      instructions, unlike the single s_pack_lh_b32_b16
;                      expected for the <2 x i16> constant case).
;   fiji / hawaii    - s_and_b32 with 0xffff0000 followed by s_or_b32 with
;                      0x4500.
575 define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
576 ; GFX9-LABEL: s_insertelement_v2f16_0:
578 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
579 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
580 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
581 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
582 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
583 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
584 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2
585 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
586 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
587 ; GFX9-NEXT: s_endpgm
589 ; CIVI-LABEL: s_insertelement_v2f16_0:
591 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
592 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
593 ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
594 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
595 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
596 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
597 ; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000
598 ; CIVI-NEXT: s_or_b32 s0, s0, 0x4500
599 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
600 ; CIVI-NEXT: flat_store_dword v[0:1], v2
601 ; CIVI-NEXT: s_endpgm
603 ; GFX11-LABEL: s_insertelement_v2f16_0:
605 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
606 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
607 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
608 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
609 ; GFX11-NEXT: s_lshr_b32 s2, s2, 16
610 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
611 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2
612 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
613 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
614 ; GFX11-NEXT: s_nop 0
615 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
616 ; GFX11-NEXT: s_endpgm
617 %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
618 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
619 store <2 x half> %vecins, ptr addrspace(1) %out
; Scalar case, half constant into element 1 of a <2 x half>.
; Replaces element 1 of the vector loaded from %vec.ptr with half 5.0
; (immediate 0x4500 in the checks) and stores the result to %out.
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - a single s_pack_ll_b32_b16 with the loaded vector as
;                      first operand and 0x4500 as second operand.
;   fiji / hawaii    - s_and_b32 with 0xffff followed by s_or_b32 with
;                      0x45000000 (the constant already placed in the upper
;                      16 bits).
623 define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
624 ; GFX9-LABEL: s_insertelement_v2f16_1:
626 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
627 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
628 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
629 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
630 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
631 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x4500
632 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
633 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
634 ; GFX9-NEXT: s_endpgm
636 ; CIVI-LABEL: s_insertelement_v2f16_1:
638 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
639 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
640 ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
641 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
642 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
643 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
644 ; CIVI-NEXT: s_and_b32 s0, s2, 0xffff
645 ; CIVI-NEXT: s_or_b32 s0, s0, 0x45000000
646 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
647 ; CIVI-NEXT: flat_store_dword v[0:1], v2
648 ; CIVI-NEXT: s_endpgm
650 ; GFX11-LABEL: s_insertelement_v2f16_1:
652 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
653 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
654 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
655 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
656 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, 0x4500
657 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
658 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
659 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
660 ; GFX11-NEXT: s_nop 0
661 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
662 ; GFX11-NEXT: s_endpgm
663 %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
664 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
665 store <2 x half> %vecins, ptr addrspace(1) %out
; Vector (per-workitem) case, constant into element 0.
; Both %in and %out are addrspace(1) pointers indexed by the sign-extended
; llvm.amdgcn.workitem.id.x, so the load address and the loaded value differ
; per workitem. Element 0 of the loaded <2 x i16> is replaced with the
; constant 999 (0x3e7) and the result is stored to the matching %out slot.
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - v_bfi_b32 with mask 0xffff; 0x3e7 is first placed in a
;                      register (v_mov_b32 on gfx900, s_movk_i32 on gfx1100).
;   fiji / hawaii    - v_and_b32 with 0xffff0000 followed by v_or_b32 with
;                      0x3e7, with explicit 64-bit address arithmetic for the
;                      flat load and store.
669 define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
670 ; GFX9-LABEL: v_insertelement_v2i16_0:
672 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
673 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
674 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e7
675 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
676 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
677 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
678 ; GFX9-NEXT: s_waitcnt vmcnt(0)
679 ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1
680 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
681 ; GFX9-NEXT: s_endpgm
683 ; VI-LABEL: v_insertelement_v2i16_0:
685 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
686 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
687 ; VI-NEXT: s_waitcnt lgkmcnt(0)
688 ; VI-NEXT: v_mov_b32_e32 v1, s3
689 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
690 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
691 ; VI-NEXT: flat_load_dword v3, v[0:1]
692 ; VI-NEXT: v_mov_b32_e32 v1, s1
693 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
694 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
695 ; VI-NEXT: s_waitcnt vmcnt(0)
696 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
697 ; VI-NEXT: v_or_b32_e32 v2, 0x3e7, v2
698 ; VI-NEXT: flat_store_dword v[0:1], v2
701 ; CI-LABEL: v_insertelement_v2i16_0:
703 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
704 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
705 ; CI-NEXT: s_waitcnt lgkmcnt(0)
706 ; CI-NEXT: v_mov_b32_e32 v1, s3
707 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
708 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
709 ; CI-NEXT: flat_load_dword v3, v[0:1]
710 ; CI-NEXT: v_mov_b32_e32 v1, s1
711 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
712 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
713 ; CI-NEXT: s_waitcnt vmcnt(0)
714 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
715 ; CI-NEXT: v_or_b32_e32 v2, 0x3e7, v2
716 ; CI-NEXT: flat_store_dword v[0:1], v2
719 ; GFX11-LABEL: v_insertelement_v2i16_0:
721 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
722 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
723 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
724 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
725 ; GFX11-NEXT: s_movk_i32 s2, 0x3e7
726 ; GFX11-NEXT: s_waitcnt vmcnt(0)
727 ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1
728 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
729 ; GFX11-NEXT: s_nop 0
730 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
731 ; GFX11-NEXT: s_endpgm
732 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
733 %tid.ext = sext i32 %tid to i64
734 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
735 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
736 %vec = load <2 x i16>, ptr addrspace(1) %in.gep
737 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
738 store <2 x i16> %vecins, ptr addrspace(1) %out.gep
; Vector (per-workitem) case, high half of an i32 argument into element 0.
; The vector is loaded per workitem from %in (indexed by
; llvm.amdgcn.workitem.id.x); the inserted i16 is the upper 16 bits of the
; kernel argument %elt.arg (lshr 16 + trunc), loaded with offset 0x10 (0x4 in
; the hawaii checks).
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - v_perm_b32 with selector 0x7060302, operands ordered
;                      (loaded vector, argument).
;   fiji             - v_perm_b32 with selector 0x3020706, operands ordered
;                      (argument, loaded vector).
;   hawaii           - v_lshrrev_b32 by 16 on the loaded vector followed by
;                      v_alignbit_b32 with shift amount 16.
742 define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %elt.arg) #0 {
743 ; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
745 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
746 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
747 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
748 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7060302
749 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
750 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
751 ; GFX9-NEXT: s_waitcnt vmcnt(0)
752 ; GFX9-NEXT: v_perm_b32 v1, v1, s6, v2
753 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
754 ; GFX9-NEXT: s_endpgm
756 ; VI-LABEL: v_insertelement_v2i16_0_reghi:
758 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
759 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
760 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
761 ; VI-NEXT: s_waitcnt lgkmcnt(0)
762 ; VI-NEXT: v_mov_b32_e32 v1, s3
763 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
764 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
765 ; VI-NEXT: flat_load_dword v3, v[0:1]
766 ; VI-NEXT: v_mov_b32_e32 v1, s1
767 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
768 ; VI-NEXT: v_mov_b32_e32 v2, 0x3020706
769 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
770 ; VI-NEXT: s_waitcnt vmcnt(0)
771 ; VI-NEXT: v_perm_b32 v2, s4, v3, v2
772 ; VI-NEXT: flat_store_dword v[0:1], v2
775 ; CI-LABEL: v_insertelement_v2i16_0_reghi:
777 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
778 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
779 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
780 ; CI-NEXT: s_waitcnt lgkmcnt(0)
781 ; CI-NEXT: v_mov_b32_e32 v1, s3
782 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
783 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
784 ; CI-NEXT: flat_load_dword v3, v[0:1]
785 ; CI-NEXT: v_mov_b32_e32 v1, s1
786 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
787 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
788 ; CI-NEXT: s_waitcnt vmcnt(0)
789 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
790 ; CI-NEXT: v_alignbit_b32 v2, v2, s4, 16
791 ; CI-NEXT: flat_store_dword v[0:1], v2
794 ; GFX11-LABEL: v_insertelement_v2i16_0_reghi:
796 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
797 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
798 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
799 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
800 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
801 ; GFX11-NEXT: s_waitcnt vmcnt(0)
802 ; GFX11-NEXT: v_perm_b32 v1, v1, s0, 0x7060302
803 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
804 ; GFX11-NEXT: s_nop 0
805 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
806 ; GFX11-NEXT: s_endpgm
807 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
808 %tid.ext = sext i32 %tid to i64
809 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
810 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
811 %vec = load <2 x i16>, ptr addrspace(1) %in.gep
812 %elt.hi = lshr i32 %elt.arg, 16
813 %elt = trunc i32 %elt.hi to i16
814 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
815 store <2 x i16> %vecins, ptr addrspace(1) %out.gep
; Vector (per-workitem) case, small constant into element 0.
; Identical in shape to the 999 variant, but the inserted constant is 53. In
; every target's checks 53 appears directly as an instruction operand, with
; no preceding move of the constant into a register.
; Selection expected by the checks in this function -
;   gfx900 / gfx1100 - v_bfi_b32 with mask 0xffff and 53 as the second source
;                      (the mask is held in s2 on gfx900 and is a literal on
;                      gfx1100).
;   fiji / hawaii    - v_and_b32 with 0xffff0000 followed by v_or_b32 with 53.
819 define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
820 ; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
822 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
823 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
824 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
825 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
826 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
827 ; GFX9-NEXT: s_waitcnt vmcnt(0)
828 ; GFX9-NEXT: v_bfi_b32 v1, s2, 53, v1
829 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
830 ; GFX9-NEXT: s_endpgm
832 ; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
834 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
835 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
836 ; VI-NEXT: s_waitcnt lgkmcnt(0)
837 ; VI-NEXT: v_mov_b32_e32 v1, s3
838 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
839 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
840 ; VI-NEXT: flat_load_dword v3, v[0:1]
841 ; VI-NEXT: v_mov_b32_e32 v1, s1
842 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
843 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
844 ; VI-NEXT: s_waitcnt vmcnt(0)
845 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
846 ; VI-NEXT: v_or_b32_e32 v2, 53, v2
847 ; VI-NEXT: flat_store_dword v[0:1], v2
850 ; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
852 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
853 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
854 ; CI-NEXT: s_waitcnt lgkmcnt(0)
855 ; CI-NEXT: v_mov_b32_e32 v1, s3
856 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
857 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
858 ; CI-NEXT: flat_load_dword v3, v[0:1]
859 ; CI-NEXT: v_mov_b32_e32 v1, s1
860 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
861 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
862 ; CI-NEXT: s_waitcnt vmcnt(0)
863 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
864 ; CI-NEXT: v_or_b32_e32 v2, 53, v2
865 ; CI-NEXT: flat_store_dword v[0:1], v2
868 ; GFX11-LABEL: v_insertelement_v2i16_0_inlineimm:
870 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
871 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
872 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
873 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
874 ; GFX11-NEXT: s_waitcnt vmcnt(0)
875 ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1
876 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
877 ; GFX11-NEXT: s_nop 0
878 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
879 ; GFX11-NEXT: s_endpgm
880 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
881 %tid.ext = sext i32 %tid to i64
882 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
883 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
884 %vec = load <2 x i16>, ptr addrspace(1) %in.gep
885 %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
886 store <2 x i16> %vecins, ptr addrspace(1) %out.gep
890 ; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
; Each workitem loads a <2 x i16> from %in, overwrites element 1 with the
; constant i16 999 (0x3e7), and stores the result to %out. The GFX9 and GFX11
; checks expect a v_perm_b32 with selector 0x5040100; the VI checks expect an
; SDWA v_or_b32 with 0x3e70000, and the CI checks an and/or pair.
891 define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
892 ; GFX9-LABEL: v_insertelement_v2i16_1:
894 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
895 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
896 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100
897 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
898 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
899 ; GFX9-NEXT: s_movk_i32 s2, 0x3e7
900 ; GFX9-NEXT: s_waitcnt vmcnt(0)
901 ; GFX9-NEXT: v_perm_b32 v1, s2, v1, v2
902 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
903 ; GFX9-NEXT: s_endpgm
905 ; VI-LABEL: v_insertelement_v2i16_1:
907 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
908 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
909 ; VI-NEXT: s_waitcnt lgkmcnt(0)
910 ; VI-NEXT: v_mov_b32_e32 v1, s3
911 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
912 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
913 ; VI-NEXT: flat_load_dword v3, v[0:1]
914 ; VI-NEXT: v_mov_b32_e32 v1, s1
915 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
916 ; VI-NEXT: v_mov_b32_e32 v2, 0x3e70000
917 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
918 ; VI-NEXT: s_waitcnt vmcnt(0)
919 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
920 ; VI-NEXT: flat_store_dword v[0:1], v2
923 ; CI-LABEL: v_insertelement_v2i16_1:
925 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
926 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
927 ; CI-NEXT: s_waitcnt lgkmcnt(0)
928 ; CI-NEXT: v_mov_b32_e32 v1, s3
929 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
930 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
931 ; CI-NEXT: flat_load_dword v3, v[0:1]
932 ; CI-NEXT: v_mov_b32_e32 v1, s1
933 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
934 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
935 ; CI-NEXT: s_waitcnt vmcnt(0)
936 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
937 ; CI-NEXT: v_or_b32_e32 v2, 0x3e70000, v2
938 ; CI-NEXT: flat_store_dword v[0:1], v2
941 ; GFX11-LABEL: v_insertelement_v2i16_1:
943 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
944 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
945 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
946 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
947 ; GFX11-NEXT: s_movk_i32 s2, 0x3e7
948 ; GFX11-NEXT: s_waitcnt vmcnt(0)
949 ; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100
950 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
951 ; GFX11-NEXT: s_nop 0
952 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
953 ; GFX11-NEXT: s_endpgm
954 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
955 %tid.ext = sext i32 %tid to i64
956 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
957 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
958 %vec = load <2 x i16>, ptr addrspace(1) %in.gep
959 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
960 store <2 x i16> %vecins, ptr addrspace(1) %out.gep
; Each workitem loads a <2 x i16> from %in, overwrites element 1 with the
; constant i16 -15, and stores the result to %out. The GFX9 and GFX11 checks
; expect -15 to be passed directly to v_perm_b32, while the VI and CI checks
; expect the pre-shifted constant 0xfff10000 to be OR-ed into the low half.
964 define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
965 ; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm:
967 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
968 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
969 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100
970 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
971 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
972 ; GFX9-NEXT: s_waitcnt vmcnt(0)
973 ; GFX9-NEXT: v_perm_b32 v1, -15, v1, v2
974 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
975 ; GFX9-NEXT: s_endpgm
977 ; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
979 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
980 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
981 ; VI-NEXT: s_waitcnt lgkmcnt(0)
982 ; VI-NEXT: v_mov_b32_e32 v1, s3
983 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
984 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
985 ; VI-NEXT: flat_load_dword v3, v[0:1]
986 ; VI-NEXT: v_mov_b32_e32 v1, s1
987 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
988 ; VI-NEXT: v_mov_b32_e32 v2, 0xfff10000
989 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
990 ; VI-NEXT: s_waitcnt vmcnt(0)
991 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
992 ; VI-NEXT: flat_store_dword v[0:1], v2
995 ; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
997 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
998 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
999 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1000 ; CI-NEXT: v_mov_b32_e32 v1, s3
1001 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1002 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1003 ; CI-NEXT: flat_load_dword v3, v[0:1]
1004 ; CI-NEXT: v_mov_b32_e32 v1, s1
1005 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1006 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1007 ; CI-NEXT: s_waitcnt vmcnt(0)
1008 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
1009 ; CI-NEXT: v_or_b32_e32 v2, 0xfff10000, v2
1010 ; CI-NEXT: flat_store_dword v[0:1], v2
1013 ; GFX11-LABEL: v_insertelement_v2i16_1_inlineimm:
1015 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
1016 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1017 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1018 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1019 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1020 ; GFX11-NEXT: v_perm_b32 v1, -15, v1, 0x5040100
1021 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1022 ; GFX11-NEXT: s_nop 0
1023 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1024 ; GFX11-NEXT: s_endpgm
1025 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1026 %tid.ext = sext i32 %tid to i64
1027 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1028 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1029 %vec = load <2 x i16>, ptr addrspace(1) %in.gep
1030 %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
1031 store <2 x i16> %vecins, ptr addrspace(1) %out.gep
; Each workitem loads a <2 x half> from %in, overwrites element 0 with the
; constant half 5.0, and stores the result to %out. In the expected output the
; constant appears as the 16-bit pattern 0x4500, merged into the low half of
; the dword with v_bfi_b32 (GFX9, GFX11) or an and/or pair (VI, CI).
1035 define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1036 ; GFX9-LABEL: v_insertelement_v2f16_0:
1038 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1039 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1040 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500
1041 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1042 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1043 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1044 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1045 ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1
1046 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1047 ; GFX9-NEXT: s_endpgm
1049 ; VI-LABEL: v_insertelement_v2f16_0:
1051 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1052 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1053 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1054 ; VI-NEXT: v_mov_b32_e32 v1, s3
1055 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1056 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1057 ; VI-NEXT: flat_load_dword v3, v[0:1]
1058 ; VI-NEXT: v_mov_b32_e32 v1, s1
1059 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1060 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1061 ; VI-NEXT: s_waitcnt vmcnt(0)
1062 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1063 ; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2
1064 ; VI-NEXT: flat_store_dword v[0:1], v2
1067 ; CI-LABEL: v_insertelement_v2f16_0:
1069 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1070 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1071 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1072 ; CI-NEXT: v_mov_b32_e32 v1, s3
1073 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1074 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1075 ; CI-NEXT: flat_load_dword v3, v[0:1]
1076 ; CI-NEXT: v_mov_b32_e32 v1, s1
1077 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1078 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1079 ; CI-NEXT: s_waitcnt vmcnt(0)
1080 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1081 ; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2
1082 ; CI-NEXT: flat_store_dword v[0:1], v2
1085 ; GFX11-LABEL: v_insertelement_v2f16_0:
1087 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
1088 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1089 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1090 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1091 ; GFX11-NEXT: s_movk_i32 s2, 0x4500
1092 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1093 ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1
1094 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1095 ; GFX11-NEXT: s_nop 0
1096 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1097 ; GFX11-NEXT: s_endpgm
1098 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1099 %tid.ext = sext i32 %tid to i64
1100 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1101 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1102 %vec = load <2 x half>, ptr addrspace(1) %in.gep
1103 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
1104 store <2 x half> %vecins, ptr addrspace(1) %out.gep
; Each workitem loads a <2 x half> from %in, overwrites element 0 with the
; half constant 0xH0035, and stores the result to %out. The expected output
; uses the bit pattern as the integer operand 53 directly in v_bfi_b32 (GFX9,
; GFX11) or v_or_b32 (VI, CI), without a separate constant-materializing move.
1108 define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1109 ; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm:
1111 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1112 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1113 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1114 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1115 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1116 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1117 ; GFX9-NEXT: v_bfi_b32 v1, s2, 53, v1
1118 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1119 ; GFX9-NEXT: s_endpgm
1121 ; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
1123 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1124 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1125 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1126 ; VI-NEXT: v_mov_b32_e32 v1, s3
1127 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1128 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1129 ; VI-NEXT: flat_load_dword v3, v[0:1]
1130 ; VI-NEXT: v_mov_b32_e32 v1, s1
1131 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1132 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1133 ; VI-NEXT: s_waitcnt vmcnt(0)
1134 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1135 ; VI-NEXT: v_or_b32_e32 v2, 53, v2
1136 ; VI-NEXT: flat_store_dword v[0:1], v2
1139 ; CI-LABEL: v_insertelement_v2f16_0_inlineimm:
1141 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1142 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1143 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1144 ; CI-NEXT: v_mov_b32_e32 v1, s3
1145 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1146 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1147 ; CI-NEXT: flat_load_dword v3, v[0:1]
1148 ; CI-NEXT: v_mov_b32_e32 v1, s1
1149 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1150 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1151 ; CI-NEXT: s_waitcnt vmcnt(0)
1152 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
1153 ; CI-NEXT: v_or_b32_e32 v2, 53, v2
1154 ; CI-NEXT: flat_store_dword v[0:1], v2
1157 ; GFX11-LABEL: v_insertelement_v2f16_0_inlineimm:
1159 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
1160 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1161 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1162 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1163 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1164 ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1
1165 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1166 ; GFX11-NEXT: s_nop 0
1167 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1168 ; GFX11-NEXT: s_endpgm
1169 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1170 %tid.ext = sext i32 %tid to i64
1171 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1172 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1173 %vec = load <2 x half>, ptr addrspace(1) %in.gep
1174 %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0
1175 store <2 x half> %vecins, ptr addrspace(1) %out.gep
; Each workitem loads a <2 x half> from %in, overwrites element 1 with the
; constant half 5.0, and stores the result to %out. The GFX9 and GFX11 checks
; expect 0x4500 to be combined via v_perm_b32 with selector 0x5040100; the VI
; and CI checks expect the pre-shifted constant 0x45000000 to be OR-ed in.
1179 define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1180 ; GFX9-LABEL: v_insertelement_v2f16_1:
1182 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1183 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1184 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100
1185 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1186 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1187 ; GFX9-NEXT: s_movk_i32 s2, 0x4500
1188 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1189 ; GFX9-NEXT: v_perm_b32 v1, s2, v1, v2
1190 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1191 ; GFX9-NEXT: s_endpgm
1193 ; VI-LABEL: v_insertelement_v2f16_1:
1195 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1196 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1197 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1198 ; VI-NEXT: v_mov_b32_e32 v1, s3
1199 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1200 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1201 ; VI-NEXT: flat_load_dword v3, v[0:1]
1202 ; VI-NEXT: v_mov_b32_e32 v1, s1
1203 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1204 ; VI-NEXT: v_mov_b32_e32 v2, 0x45000000
1205 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1206 ; VI-NEXT: s_waitcnt vmcnt(0)
1207 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1208 ; VI-NEXT: flat_store_dword v[0:1], v2
1211 ; CI-LABEL: v_insertelement_v2f16_1:
1213 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1214 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1215 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1216 ; CI-NEXT: v_mov_b32_e32 v1, s3
1217 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1218 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1219 ; CI-NEXT: flat_load_dword v3, v[0:1]
1220 ; CI-NEXT: v_mov_b32_e32 v1, s1
1221 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1222 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1223 ; CI-NEXT: s_waitcnt vmcnt(0)
1224 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
1225 ; CI-NEXT: v_or_b32_e32 v2, 0x45000000, v2
1226 ; CI-NEXT: flat_store_dword v[0:1], v2
1229 ; GFX11-LABEL: v_insertelement_v2f16_1:
1231 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
1232 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1233 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1234 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1235 ; GFX11-NEXT: s_movk_i32 s2, 0x4500
1236 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1237 ; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100
1238 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1239 ; GFX11-NEXT: s_nop 0
1240 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1241 ; GFX11-NEXT: s_endpgm
1242 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1243 %tid.ext = sext i32 %tid to i64
1244 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1245 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1246 %vec = load <2 x half>, ptr addrspace(1) %in.gep
1247 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
1248 store <2 x half> %vecins, ptr addrspace(1) %out.gep
; Each workitem loads a <2 x half> from %in, overwrites element 1 with the
; half constant 0xH0023, and stores the result to %out. The GFX9 and GFX11
; checks expect the bit pattern to be passed to v_perm_b32 as the integer
; operand 35; the VI and CI checks expect the pre-shifted constant 0x230000.
1252 define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1253 ; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm:
1255 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1256 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1257 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x5040100
1258 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1259 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1260 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1261 ; GFX9-NEXT: v_perm_b32 v1, 35, v1, v2
1262 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1263 ; GFX9-NEXT: s_endpgm
1265 ; VI-LABEL: v_insertelement_v2f16_1_inlineimm:
1267 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1268 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1269 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1270 ; VI-NEXT: v_mov_b32_e32 v1, s3
1271 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1272 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1273 ; VI-NEXT: flat_load_dword v3, v[0:1]
1274 ; VI-NEXT: v_mov_b32_e32 v1, s1
1275 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1276 ; VI-NEXT: v_mov_b32_e32 v2, 0x230000
1277 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1278 ; VI-NEXT: s_waitcnt vmcnt(0)
1279 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1280 ; VI-NEXT: flat_store_dword v[0:1], v2
1283 ; CI-LABEL: v_insertelement_v2f16_1_inlineimm:
1285 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1286 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1287 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1288 ; CI-NEXT: v_mov_b32_e32 v1, s3
1289 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1290 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1291 ; CI-NEXT: flat_load_dword v3, v[0:1]
1292 ; CI-NEXT: v_mov_b32_e32 v1, s1
1293 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1294 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1295 ; CI-NEXT: s_waitcnt vmcnt(0)
1296 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
1297 ; CI-NEXT: v_or_b32_e32 v2, 0x230000, v2
1298 ; CI-NEXT: flat_store_dword v[0:1], v2
1301 ; GFX11-LABEL: v_insertelement_v2f16_1_inlineimm:
1303 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
1304 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1305 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1306 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
1307 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1308 ; GFX11-NEXT: v_perm_b32 v1, 35, v1, 0x5040100
1309 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1310 ; GFX11-NEXT: s_nop 0
1311 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1312 ; GFX11-NEXT: s_endpgm
1313 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1314 %tid.ext = sext i32 %tid to i64
1315 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1316 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1317 %vec = load <2 x half>, ptr addrspace(1) %in.gep
1318 %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1
1319 store <2 x half> %vecins, ptr addrspace(1) %out.gep
1323 ; FIXME: Enable for others when the argument load is not split
; Inserts the constant i16 999 into a <2 x i16> at an index that is only known
; at run time. The index comes from a volatile i32 load of %idx.ptr and the
; vector from a load of %vec.ptr (both ptr addrspace(4)); the result is stored
; to %out. The expected output is all-scalar on every target. The index is
; shifted left by 4 and used to shift the mask 0xffff; the mask then selects
; between the splatted constant 0x3e703e7 and the original vector bits.
1324 define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(4) %idx.ptr) #0 {
1325 ; GFX9-LABEL: s_insertelement_v2i16_dynamic:
1327 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1328 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1329 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1330 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1331 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
1332 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
1333 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1334 ; GFX9-NEXT: s_lshl_b32 s2, s4, 4
1335 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2
1336 ; GFX9-NEXT: s_andn2_b32 s3, s5, s2
1337 ; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7
1338 ; GFX9-NEXT: s_or_b32 s2, s2, s3
1339 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1340 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1341 ; GFX9-NEXT: s_endpgm
1343 ; VI-LABEL: s_insertelement_v2i16_dynamic:
1345 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1346 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1347 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1348 ; VI-NEXT: s_load_dword s4, s[6:7], 0x0
1349 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
1350 ; VI-NEXT: v_mov_b32_e32 v0, s0
1351 ; VI-NEXT: v_mov_b32_e32 v1, s1
1352 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1353 ; VI-NEXT: s_lshl_b32 s0, s4, 4
1354 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
1355 ; VI-NEXT: s_andn2_b32 s1, s2, s0
1356 ; VI-NEXT: s_and_b32 s0, s0, 0x3e703e7
1357 ; VI-NEXT: s_or_b32 s0, s0, s1
1358 ; VI-NEXT: v_mov_b32_e32 v2, s0
1359 ; VI-NEXT: flat_store_dword v[0:1], v2
1362 ; CI-LABEL: s_insertelement_v2i16_dynamic:
1364 ; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4
1365 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1366 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1367 ; CI-NEXT: s_load_dword s4, s[6:7], 0x0
1368 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
1369 ; CI-NEXT: v_mov_b32_e32 v0, s0
1370 ; CI-NEXT: v_mov_b32_e32 v1, s1
1371 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1372 ; CI-NEXT: s_lshl_b32 s0, s4, 4
1373 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0
1374 ; CI-NEXT: s_andn2_b32 s1, s2, s0
1375 ; CI-NEXT: s_and_b32 s0, s0, 0x3e703e7
1376 ; CI-NEXT: s_or_b32 s0, s0, s1
1377 ; CI-NEXT: v_mov_b32_e32 v2, s0
1378 ; CI-NEXT: flat_store_dword v[0:1], v2
1381 ; GFX11-LABEL: s_insertelement_v2i16_dynamic:
1383 ; GFX11-NEXT: s_clause 0x1
1384 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10
1385 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
1386 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1387 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
1388 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
1389 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1390 ; GFX11-NEXT: s_lshl_b32 s3, s4, 4
1391 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1392 ; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s3
1393 ; GFX11-NEXT: s_and_not1_b32 s2, s2, s3
1394 ; GFX11-NEXT: s_and_b32 s3, s3, 0x3e703e7
1395 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1396 ; GFX11-NEXT: s_or_b32 s2, s3, s2
1397 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
1398 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1399 ; GFX11-NEXT: s_nop 0
1400 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1401 ; GFX11-NEXT: s_endpgm
1402 %idx = load volatile i32, ptr addrspace(4) %idx.ptr
1403 %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
1404 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1405 store <2 x i16> %vecins, ptr addrspace(1) %out
; Each workitem loads a <2 x i16> from %in, inserts the constant i16 999 at the
; run-time index %idx (passed as an i32 kernel argument), and stores the
; result to %out. In the expected output the lane mask is built with scalar
; shifts (index << 4, then 0xffff shifted by that amount) and applied to the
; per-workitem vector with v_bfi_b32 against the splatted constant 0x3e703e7.
1409 define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 {
1410 ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1412 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1413 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
1414 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1415 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7
1416 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1417 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1418 ; GFX9-NEXT: s_lshl_b32 s2, s6, 4
1419 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2
1420 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1421 ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1
1422 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1423 ; GFX9-NEXT: s_endpgm
1425 ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1427 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1428 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1429 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1430 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1431 ; VI-NEXT: v_mov_b32_e32 v1, s3
1432 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1433 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1434 ; VI-NEXT: flat_load_dword v3, v[0:1]
1435 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1436 ; VI-NEXT: s_lshl_b32 s0, s4, 4
1437 ; VI-NEXT: v_mov_b32_e32 v1, s1
1438 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
1439 ; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7
1440 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1441 ; VI-NEXT: s_waitcnt vmcnt(0)
1442 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
1443 ; VI-NEXT: flat_store_dword v[0:1], v2
1446 ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1448 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1449 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1450 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1451 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1452 ; CI-NEXT: v_mov_b32_e32 v1, s3
1453 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1454 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1455 ; CI-NEXT: flat_load_dword v3, v[0:1]
1456 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1457 ; CI-NEXT: s_lshl_b32 s0, s4, 4
1458 ; CI-NEXT: v_mov_b32_e32 v1, s1
1459 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0
1460 ; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7
1461 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1462 ; CI-NEXT: s_waitcnt vmcnt(0)
1463 ; CI-NEXT: v_bfi_b32 v2, s0, v2, v3
1464 ; CI-NEXT: flat_store_dword v[0:1], v2
1467 ; GFX11-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1469 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1470 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1471 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
1472 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1473 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
1474 ; GFX11-NEXT: s_lshl_b32 s0, s0, 4
1475 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1476 ; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0
1477 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1478 ; GFX11-NEXT: v_bfi_b32 v1, s0, 0x3e703e7, v1
1479 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
1480 ; GFX11-NEXT: s_nop 0
1481 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1482 ; GFX11-NEXT: s_endpgm
1483 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1484 %tid.ext = sext i32 %tid to i64
1485 %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1486 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1487 %vec = load <2 x i16>, ptr addrspace(1) %in.gep
1488 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1489 store <2 x i16> %vecins, ptr addrspace(1) %out.gep
; Each workitem loads its own insertion index (i32 from %idx.ptr) and its own
; <2 x half> from %in, inserts the half constant 0xH1234 at that index, and
; stores the result to %out. Because the index is loaded per workitem, the
; expected output builds the mask with vector shifts (index << 4, then 0xffff
; shifted by that amount) and merges the splatted constant 0x12341234 with
; v_bfi_b32.
1493 define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 {
1494 ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1496 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1497 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1498 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1499 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1500 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
1501 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
1502 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1503 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1504 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
1505 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2
1506 ; GFX9-NEXT: s_mov_b32 s2, 0x12341234
1507 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1508 ; GFX9-NEXT: v_bfi_b32 v1, v1, s2, v2
1509 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1510 ; GFX9-NEXT: s_endpgm
1512 ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1514 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1515 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
1516 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1517 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1518 ; VI-NEXT: v_mov_b32_e32 v3, s3
1519 ; VI-NEXT: v_mov_b32_e32 v1, s5
1520 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1521 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1522 ; VI-NEXT: flat_load_dword v4, v[0:1]
1523 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1524 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1525 ; VI-NEXT: flat_load_dword v3, v[0:1]
1526 ; VI-NEXT: s_mov_b32 s2, 0xffff
1527 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1528 ; VI-NEXT: v_mov_b32_e32 v1, s1
1529 ; VI-NEXT: s_mov_b32 s0, 0x12341234
1530 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1531 ; VI-NEXT: s_waitcnt vmcnt(1)
1532 ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4
1533 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2
1534 ; VI-NEXT: s_waitcnt vmcnt(0)
1535 ; VI-NEXT: v_bfi_b32 v2, v2, s0, v3
1536 ; VI-NEXT: flat_store_dword v[0:1], v2
1539 ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1541 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1542 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
1543 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1544 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1545 ; CI-NEXT: v_mov_b32_e32 v3, s3
1546 ; CI-NEXT: v_mov_b32_e32 v1, s5
1547 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
1548 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1549 ; CI-NEXT: flat_load_dword v4, v[0:1]
1550 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1551 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1552 ; CI-NEXT: flat_load_dword v3, v[0:1]
1553 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1554 ; CI-NEXT: v_mov_b32_e32 v1, s1
1555 ; CI-NEXT: s_mov_b32 s0, 0x12341234
1556 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1557 ; CI-NEXT: s_waitcnt vmcnt(1)
1558 ; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4
1559 ; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
1560 ; CI-NEXT: s_waitcnt vmcnt(0)
1561 ; CI-NEXT: v_bfi_b32 v2, v2, s0, v3
1562 ; CI-NEXT: flat_store_dword v[0:1], v2
1565 ; GFX11-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1567 ; GFX11-NEXT: s_clause 0x1
1568 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10
1569 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
1570 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1571 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1572 ; GFX11-NEXT: s_clause 0x1
1573 ; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
1574 ; GFX11-NEXT: global_load_b32 v2, v0, s[2:3]
1575 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1576 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1
1577 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1578 ; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 0xffff
1579 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1580 ; GFX11-NEXT: v_bfi_b32 v1, v1, 0x12341234, v2
1581 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1582 ; GFX11-NEXT: s_nop 0
1583 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1584 ; GFX11-NEXT: s_endpgm
1585 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1586 %tid.ext = sext i32 %tid to i64
1587 %in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
1588 %idx.gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
1589 %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
1590 %idx = load i32, ptr addrspace(1) %idx.gep
1591 %vec = load <2 x half>, ptr addrspace(1) %in.gep
1592 %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
1593 store <2 x half> %vecins, ptr addrspace(1) %out.gep
; Each workitem loads a <4 x half> from %in, overwrites element 0 with a
; non-constant value, and stores the result to %out. The inserted half is the
; i32 kernel argument %val truncated to i16 and bitcast to half; %val follows
; an [8 x i32] padding argument. In the expected output only the low dword of
; the 64-bit vector is modified, using v_bfi_b32 with mask 0xffff on GFX9, CI
; and GFX11, and v_perm_b32 with selector 0x3020504 on VI.
1597 define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
1598 ; GFX9-LABEL: v_insertelement_v4f16_0:
1600 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1601 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
1602 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1603 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1604 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1605 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1606 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
1607 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1608 ; GFX9-NEXT: v_bfi_b32 v0, s2, v3, v0
1609 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1610 ; GFX9-NEXT: s_endpgm
1612 ; VI-LABEL: v_insertelement_v4f16_0:
1614 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1615 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
1616 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1617 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
1618 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1619 ; VI-NEXT: v_mov_b32_e32 v1, s3
1620 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1621 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1622 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1623 ; VI-NEXT: v_mov_b32_e32 v3, s1
1624 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1625 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1626 ; VI-NEXT: s_waitcnt vmcnt(0)
1627 ; VI-NEXT: v_perm_b32 v0, s4, v0, v4
1628 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1631 ; CI-LABEL: v_insertelement_v4f16_0:
1633 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1634 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
1635 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1636 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1637 ; CI-NEXT: v_mov_b32_e32 v1, s3
1638 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1639 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1640 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1641 ; CI-NEXT: v_mov_b32_e32 v3, s1
1642 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1643 ; CI-NEXT: s_mov_b32 s0, 0xffff
1644 ; CI-NEXT: v_mov_b32_e32 v4, s4
1645 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1646 ; CI-NEXT: s_waitcnt vmcnt(0)
1647 ; CI-NEXT: v_bfi_b32 v0, s0, v4, v0
1648 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1651 ; GFX11-LABEL: v_insertelement_v4f16_0:
1653 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1654 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1655 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30
1656 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1657 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
1658 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1659 ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s0, v0
1660 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
1661 ; GFX11-NEXT: s_nop 0
1662 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1663 ; GFX11-NEXT: s_endpgm
1664 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1665 %tid.ext = sext i32 %tid to i64
1666 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
1667 %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
1668 %vec = load <4 x half>, ptr addrspace(1) %in.gep
1669 %val.trunc = trunc i32 %val to i16
1670 %val.cvt = bitcast i16 %val.trunc to half
1671 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
1672 store <4 x half> %vecins, ptr addrspace(1) %out.gep
; Inserts a scalar into element 1 of a <4 x half> vector loaded from %in and
; stores the updated vector to %out. The scalar is the low 16 bits of the i32
; kernel argument %val, reinterpreted as half. Both %in and %out are indexed by
; the result of llvm.amdgcn.workitem.id.x.
1676 define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
1677 ; GFX9-LABEL: v_insertelement_v4f16_1:
1679 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1680 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
1681 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1682 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100
1683 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1684 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1685 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1686 ; GFX9-NEXT: v_perm_b32 v0, s6, v0, v3
1687 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1688 ; GFX9-NEXT: s_endpgm
1690 ; VI-LABEL: v_insertelement_v4f16_1:
1692 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1693 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1694 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1695 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
1696 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1697 ; VI-NEXT: v_mov_b32_e32 v1, s3
1698 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1699 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1700 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1701 ; VI-NEXT: v_mov_b32_e32 v3, s1
1702 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1703 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1704 ; VI-NEXT: s_waitcnt vmcnt(0)
1705 ; VI-NEXT: v_perm_b32 v0, v0, s4, v4
1706 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1709 ; CI-LABEL: v_insertelement_v4f16_1:
1711 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1712 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1713 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1714 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1715 ; CI-NEXT: v_mov_b32_e32 v1, s3
1716 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1717 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1718 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1719 ; CI-NEXT: v_mov_b32_e32 v3, s1
1720 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1721 ; CI-NEXT: s_lshl_b32 s0, s4, 16
1722 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1723 ; CI-NEXT: s_waitcnt vmcnt(0)
1724 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1725 ; CI-NEXT: v_or_b32_e32 v0, s0, v0
1726 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1729 ; GFX11-LABEL: v_insertelement_v4f16_1:
1731 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1732 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1733 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
1734 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1735 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
1736 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1737 ; GFX11-NEXT: v_perm_b32 v0, s0, v0, 0x5040100
1738 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
1739 ; GFX11-NEXT: s_nop 0
1740 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1741 ; GFX11-NEXT: s_endpgm
; Address the input and output vectors with the sign-extended workitem id.
1742 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1743 %tid.ext = sext i32 %tid to i64
1744 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
1745 %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
1746 %vec = load <4 x half>, ptr addrspace(1) %in.gep
; Keep only the low 16 bits of %val and reinterpret them as a half.
1747 %val.trunc = trunc i32 %val to i16
1748 %val.cvt = bitcast i16 %val.trunc to half
; Constant insert index 1.
1749 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
1750 store <4 x half> %vecins, ptr addrspace(1) %out.gep
; Inserts a scalar into element 2 of a <4 x half> vector loaded from %in and
; stores the updated vector to %out. The scalar is the low 16 bits of the i32
; kernel argument %val, reinterpreted as half. An unnamed [8 x i32] argument
; sits between %in and %val in the signature.
1754 define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
1755 ; GFX9-LABEL: v_insertelement_v4f16_2:
1757 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1758 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
1759 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1760 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1761 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1762 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1763 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
1764 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1765 ; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1
1766 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1767 ; GFX9-NEXT: s_endpgm
1769 ; VI-LABEL: v_insertelement_v4f16_2:
1771 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1772 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
1773 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1774 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
1775 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1776 ; VI-NEXT: v_mov_b32_e32 v1, s3
1777 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1778 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1779 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1780 ; VI-NEXT: v_mov_b32_e32 v3, s1
1781 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1782 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1783 ; VI-NEXT: s_waitcnt vmcnt(0)
1784 ; VI-NEXT: v_perm_b32 v1, s4, v1, v4
1785 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1788 ; CI-LABEL: v_insertelement_v4f16_2:
1790 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1791 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
1792 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1793 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1794 ; CI-NEXT: v_mov_b32_e32 v1, s3
1795 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1796 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1797 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1798 ; CI-NEXT: v_mov_b32_e32 v3, s1
1799 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1800 ; CI-NEXT: s_mov_b32 s0, 0xffff
1801 ; CI-NEXT: v_mov_b32_e32 v4, s4
1802 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1803 ; CI-NEXT: s_waitcnt vmcnt(0)
1804 ; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
1805 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1808 ; GFX11-LABEL: v_insertelement_v4f16_2:
1810 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1811 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1812 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30
1813 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1814 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
1815 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1816 ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1
1817 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
1818 ; GFX11-NEXT: s_nop 0
1819 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1820 ; GFX11-NEXT: s_endpgm
; Address the input and output vectors with the sign-extended workitem id.
1821 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1822 %tid.ext = sext i32 %tid to i64
1823 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
1824 %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
1825 %vec = load <4 x half>, ptr addrspace(1) %in.gep
; Keep only the low 16 bits of %val and reinterpret them as a half.
1826 %val.trunc = trunc i32 %val to i16
1827 %val.cvt = bitcast i16 %val.trunc to half
; Constant insert index 2.
1828 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
1829 store <4 x half> %vecins, ptr addrspace(1) %out.gep
; Inserts a scalar into element 3 (the last element) of a <4 x half> vector
; loaded from %in and stores the updated vector to %out. The scalar is the low
; 16 bits of the i32 kernel argument %val, reinterpreted as half.
1833 define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
1834 ; GFX9-LABEL: v_insertelement_v4f16_3:
1836 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1837 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
1838 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1839 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x5040100
1840 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1841 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1842 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1843 ; GFX9-NEXT: v_perm_b32 v1, s6, v1, v3
1844 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1845 ; GFX9-NEXT: s_endpgm
1847 ; VI-LABEL: v_insertelement_v4f16_3:
1849 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1850 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1851 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1852 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
1853 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1854 ; VI-NEXT: v_mov_b32_e32 v1, s3
1855 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1856 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1857 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1858 ; VI-NEXT: v_mov_b32_e32 v3, s1
1859 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1860 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1861 ; VI-NEXT: s_waitcnt vmcnt(0)
1862 ; VI-NEXT: v_perm_b32 v1, v1, s4, v4
1863 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1866 ; CI-LABEL: v_insertelement_v4f16_3:
1868 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1869 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1870 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1871 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1872 ; CI-NEXT: v_mov_b32_e32 v1, s3
1873 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1874 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1875 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1876 ; CI-NEXT: v_mov_b32_e32 v3, s1
1877 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1878 ; CI-NEXT: s_lshl_b32 s0, s4, 16
1879 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1880 ; CI-NEXT: s_waitcnt vmcnt(0)
1881 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
1882 ; CI-NEXT: v_or_b32_e32 v1, s0, v1
1883 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1886 ; GFX11-LABEL: v_insertelement_v4f16_3:
1888 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1889 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1890 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
1891 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1892 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
1893 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1894 ; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100
1895 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
1896 ; GFX11-NEXT: s_nop 0
1897 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1898 ; GFX11-NEXT: s_endpgm
; Address the input and output vectors with the sign-extended workitem id.
1899 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1900 %tid.ext = sext i32 %tid to i64
1901 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
1902 %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
1903 %vec = load <4 x half>, ptr addrspace(1) %in.gep
; Keep only the low 16 bits of %val and reinterpret them as a half.
1904 %val.trunc = trunc i32 %val to i16
1905 %val.cvt = bitcast i16 %val.trunc to half
; Constant insert index 3.
1906 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
1907 store <4 x half> %vecins, ptr addrspace(1) %out.gep
; Integer counterpart of the <4 x half> insert tests. Inserts the low 16 bits
; of the i32 kernel argument %val into element 2 of a <4 x i16> vector loaded
; from %in and stores the updated vector to %out.
1911 define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
1912 ; GFX9-LABEL: v_insertelement_v4i16_2:
1914 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1915 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
1916 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1917 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1918 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1919 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1920 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
1921 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1922 ; GFX9-NEXT: v_bfi_b32 v1, s2, v3, v1
1923 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1924 ; GFX9-NEXT: s_endpgm
1926 ; VI-LABEL: v_insertelement_v4i16_2:
1928 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1929 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1930 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1931 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
1932 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1933 ; VI-NEXT: v_mov_b32_e32 v1, s3
1934 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1935 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1936 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1937 ; VI-NEXT: v_mov_b32_e32 v3, s1
1938 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1939 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1940 ; VI-NEXT: s_waitcnt vmcnt(0)
1941 ; VI-NEXT: v_perm_b32 v1, s4, v1, v4
1942 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1945 ; CI-LABEL: v_insertelement_v4i16_2:
1947 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1948 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1949 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1950 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1951 ; CI-NEXT: v_mov_b32_e32 v1, s3
1952 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1953 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1954 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1955 ; CI-NEXT: v_mov_b32_e32 v3, s1
1956 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1957 ; CI-NEXT: s_mov_b32 s0, 0xffff
1958 ; CI-NEXT: v_mov_b32_e32 v4, s4
1959 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1960 ; CI-NEXT: s_waitcnt vmcnt(0)
1961 ; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
1962 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1965 ; GFX11-LABEL: v_insertelement_v4i16_2:
1967 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1968 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1969 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
1970 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1971 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
1972 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1973 ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1
1974 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
1975 ; GFX11-NEXT: s_nop 0
1976 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1977 ; GFX11-NEXT: s_endpgm
; Address the input and output vectors with the sign-extended workitem id.
1978 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1979 %tid.ext = sext i32 %tid to i64
1980 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
1981 %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
1982 %vec = load <4 x i16>, ptr addrspace(1) %in.gep
; Keep only the low 16 bits of %val. The i16-to-i16 bitcast that follows is an
; identity cast, presumably kept so the body mirrors the half variants.
1983 %val.trunc = trunc i32 %val to i16
1984 %val.cvt = bitcast i16 %val.trunc to i16
; Constant insert index 2.
1985 %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
1986 store <4 x i16> %vecins, ptr addrspace(1) %out.gep
1990 ; FIXME: Better code on CI?
; Inserts the low 16 bits of the i32 kernel argument %val into a <4 x i16>
; vector at an index that is only known at run time. The index is obtained
; with a volatile load; judging by the test name, the intent is presumably an
; index that lives in a VGPR.
1991 define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
1992 ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1994 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1995 ; GFX9-NEXT: global_load_dword v2, v[0:1], off glc
1996 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1997 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1998 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
1999 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2000 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
2001 ; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff
2002 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
2003 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
2004 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4
2005 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2006 ; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1
2007 ; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0
2008 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
2009 ; GFX9-NEXT: s_endpgm
2011 ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
2013 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2014 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
2015 ; VI-NEXT: s_waitcnt vmcnt(0)
2016 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
2017 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
2018 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2019 ; VI-NEXT: v_mov_b32_e32 v1, s3
2020 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2021 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2022 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2023 ; VI-NEXT: s_mov_b64 s[2:3], 0xffff
2024 ; VI-NEXT: v_mov_b32_e32 v3, s1
2025 ; VI-NEXT: s_lshl_b32 s1, s4, 16
2026 ; VI-NEXT: s_and_b32 s4, s4, 0xffff
2027 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
2028 ; VI-NEXT: s_or_b32 s0, s4, s1
2029 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2030 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
2031 ; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3]
2032 ; VI-NEXT: s_waitcnt vmcnt(0)
2033 ; VI-NEXT: v_bfi_b32 v1, v5, s0, v1
2034 ; VI-NEXT: v_bfi_b32 v0, v4, s0, v0
2035 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2038 ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
2040 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2041 ; CI-NEXT: flat_load_dword v4, v[0:1] glc
2042 ; CI-NEXT: s_waitcnt vmcnt(0)
2043 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
2044 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
2045 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2046 ; CI-NEXT: v_mov_b32_e32 v1, s3
2047 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
2048 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2049 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2050 ; CI-NEXT: s_mov_b64 s[2:3], 0xffff
2051 ; CI-NEXT: v_mov_b32_e32 v3, s1
2052 ; CI-NEXT: s_lshl_b32 s1, s4, 16
2053 ; CI-NEXT: s_and_b32 s4, s4, 0xffff
2054 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
2055 ; CI-NEXT: s_or_b32 s0, s4, s1
2056 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2057 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
2058 ; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4
2059 ; CI-NEXT: s_waitcnt vmcnt(0)
2060 ; CI-NEXT: v_bfi_b32 v1, v5, s0, v1
2061 ; CI-NEXT: v_bfi_b32 v0, v4, s0, v0
2062 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2065 ; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr:
2067 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
2068 ; GFX11-NEXT: global_load_b32 v2, v[0:1], off glc dlc
2069 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2070 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
2071 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
2072 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2073 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
2074 ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s0
2075 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2
2076 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2077 ; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 0xffff
2078 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2079 ; GFX11-NEXT: v_bfi_b32 v1, v3, s0, v1
2080 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2081 ; GFX11-NEXT: v_bfi_b32 v0, v2, s0, v0
2082 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
2083 ; GFX11-NEXT: s_nop 0
2084 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2085 ; GFX11-NEXT: s_endpgm
; Address the input and output vectors with the sign-extended workitem id.
2086 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2087 %tid.ext = sext i32 %tid to i64
2088 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2089 %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
; The insert index comes from a volatile load through an undef pointer, so it
; is not a compile-time constant.
2090 %idx.val = load volatile i32, ptr addrspace(1) undef
2091 %vec = load <4 x i16>, ptr addrspace(1) %in.gep
; Keep only the low 16 bits of %val. The i16-to-i16 bitcast that follows is an
; identity cast.
2092 %val.trunc = trunc i32 %val to i16
2093 %val.cvt = bitcast i16 %val.trunc to i16
; Variable insert index.
2094 %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
2095 store <4 x i16> %vecins, ptr addrspace(1) %out.gep
; Inserts a scalar into a <4 x half> vector at an index supplied by the i32
; kernel argument %idxval. The scalar is the low 16 bits of the i32 kernel
; argument %val, reinterpreted as half. Judging by the test name, the intent is
; presumably an index that lives in an SGPR.
2099 define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 {
2100 ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2102 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2103 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
2104 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
2105 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2106 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
2107 ; GFX9-NEXT: s_lshl_b32 s2, s7, 4
2108 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s6
2109 ; GFX9-NEXT: s_lshl_b64 s[2:3], 0xffff, s2
2110 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
2111 ; GFX9-NEXT: v_mov_b32_e32 v4, s4
2112 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2113 ; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1
2114 ; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0
2115 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
2116 ; GFX9-NEXT: s_endpgm
2118 ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2120 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2121 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
2122 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
2123 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2124 ; VI-NEXT: v_mov_b32_e32 v1, s3
2125 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
2126 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2127 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2128 ; VI-NEXT: v_mov_b32_e32 v3, s1
2129 ; VI-NEXT: s_lshl_b32 s1, s4, 16
2130 ; VI-NEXT: s_and_b32 s2, s4, 0xffff
2131 ; VI-NEXT: s_lshl_b32 s3, s5, 4
2132 ; VI-NEXT: s_or_b32 s2, s2, s1
2133 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
2134 ; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s3
2135 ; VI-NEXT: v_mov_b32_e32 v4, s2
2136 ; VI-NEXT: v_mov_b32_e32 v5, s2
2137 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2138 ; VI-NEXT: s_waitcnt vmcnt(0)
2139 ; VI-NEXT: v_bfi_b32 v1, s1, v4, v1
2140 ; VI-NEXT: v_bfi_b32 v0, s0, v5, v0
2141 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2144 ; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2146 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2147 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
2148 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
2149 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2150 ; CI-NEXT: v_mov_b32_e32 v1, s3
2151 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
2152 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2153 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2154 ; CI-NEXT: v_mov_b32_e32 v3, s1
2155 ; CI-NEXT: s_and_b32 s1, s4, 0xffff
2156 ; CI-NEXT: s_lshl_b32 s2, s4, 16
2157 ; CI-NEXT: s_lshl_b32 s3, s5, 4
2158 ; CI-NEXT: s_or_b32 s2, s1, s2
2159 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
2160 ; CI-NEXT: s_lshl_b64 s[0:1], 0xffff, s3
2161 ; CI-NEXT: v_mov_b32_e32 v4, s2
2162 ; CI-NEXT: v_mov_b32_e32 v5, s2
2163 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2164 ; CI-NEXT: s_waitcnt vmcnt(0)
2165 ; CI-NEXT: v_bfi_b32 v1, s1, v4, v1
2166 ; CI-NEXT: v_bfi_b32 v0, s0, v5, v0
2167 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2170 ; GFX11-LABEL: v_insertelement_v4f16_dynamic_sgpr:
2172 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
2173 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
2174 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
2175 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2176 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
2177 ; GFX11-NEXT: s_lshl_b32 s1, s1, 4
2178 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s0
2179 ; GFX11-NEXT: s_lshl_b64 s[0:1], 0xffff, s1
2180 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2181 ; GFX11-NEXT: v_bfi_b32 v1, s1, s2, v1
2182 ; GFX11-NEXT: v_bfi_b32 v0, s0, s2, v0
2183 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
2184 ; GFX11-NEXT: s_nop 0
2185 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2186 ; GFX11-NEXT: s_endpgm
; Address the input and output vectors with the sign-extended workitem id.
2187 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2188 %tid.ext = sext i32 %tid to i64
2189 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
2190 %out.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %tid.ext
2191 %vec = load <4 x half>, ptr addrspace(1) %in.gep
; Keep only the low 16 bits of %val and reinterpret them as a half.
2192 %val.trunc = trunc i32 %val to i16
2193 %val.cvt = bitcast i16 %val.trunc to half
; Variable insert index taken directly from the kernel argument %idxval.
2194 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
2195 store <4 x half> %vecins, ptr addrspace(1) %out.gep
; Inserts a scalar into element 3 of an <8 x half> vector loaded from %in and
; stores the updated vector to %out. The scalar is the low 16 bits of the i32
; kernel argument %val, reinterpreted as half.
2199 define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
2200 ; GFX9-LABEL: v_insertelement_v8f16_3:
2202 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2203 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
2204 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2205 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x5040100
2206 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2207 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
2208 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2209 ; GFX9-NEXT: v_perm_b32 v1, s6, v1, v5
2210 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
2211 ; GFX9-NEXT: s_endpgm
2213 ; VI-LABEL: v_insertelement_v8f16_3:
2215 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2216 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
2217 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2218 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2219 ; VI-NEXT: v_mov_b32_e32 v1, s3
2220 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
2221 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2222 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2223 ; VI-NEXT: v_mov_b32_e32 v5, s1
2224 ; VI-NEXT: s_lshl_b32 s1, s4, 16
2225 ; VI-NEXT: s_mov_b32 s2, 0xffff
2226 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
2227 ; VI-NEXT: v_mov_b32_e32 v6, s1
2228 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2229 ; VI-NEXT: s_waitcnt vmcnt(0)
2230 ; VI-NEXT: v_bfi_b32 v3, s2, v3, v3
2231 ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2232 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2235 ; CI-LABEL: v_insertelement_v8f16_3:
2237 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2238 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
2239 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2240 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2241 ; CI-NEXT: v_mov_b32_e32 v1, s3
2242 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
2243 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2244 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2245 ; CI-NEXT: v_mov_b32_e32 v5, s1
2246 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
2247 ; CI-NEXT: s_lshl_b32 s0, s4, 16
2248 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2249 ; CI-NEXT: s_waitcnt vmcnt(0)
2250 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
2251 ; CI-NEXT: v_or_b32_e32 v1, s0, v1
2252 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2255 ; GFX11-LABEL: v_insertelement_v8f16_3:
2257 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
2258 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2259 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
2260 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2261 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7]
2262 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2263 ; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100
2264 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
2265 ; GFX11-NEXT: s_nop 0
2266 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2267 ; GFX11-NEXT: s_endpgm
; Address the input and output vectors with the sign-extended workitem id.
2268 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2269 %tid.ext = sext i32 %tid to i64
2270 %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
2271 %out.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %out, i64 %tid.ext
2272 %vec = load <8 x half>, ptr addrspace(1) %in.gep
; Keep only the low 16 bits of %val and reinterpret them as a half.
2273 %val.trunc = trunc i32 %val to i16
2274 %val.cvt = bitcast i16 %val.trunc to half
; Constant insert index 3.
2275 %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 3
2276 store <8 x half> %vecins, ptr addrspace(1) %out.gep
; Inserts the low 16 bits of the i32 kernel argument %val into element 6 of an
; <8 x i16> vector loaded from %in and stores the updated vector to %out.
2280 define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
2281 ; GFX9-LABEL: v_insertelement_v8i16_6:
2283 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2284 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
2285 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2286 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2287 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
2288 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
2289 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
2290 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2291 ; GFX9-NEXT: v_bfi_b32 v3, s2, v5, v3
2292 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
2293 ; GFX9-NEXT: s_endpgm
2295 ; VI-LABEL: v_insertelement_v8i16_6:
2297 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2298 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
2299 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2300 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2301 ; VI-NEXT: v_mov_b32_e32 v1, s3
2302 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
2303 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2304 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2305 ; VI-NEXT: s_mov_b32 s2, 0xffff
2306 ; VI-NEXT: v_mov_b32_e32 v5, s1
2307 ; VI-NEXT: v_mov_b32_e32 v6, s4
2308 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
2309 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2310 ; VI-NEXT: s_waitcnt vmcnt(0)
2311 ; VI-NEXT: v_bfi_b32 v3, s2, v6, v3
2312 ; VI-NEXT: v_bfi_b32 v1, s2, v1, v1
2313 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2316 ; CI-LABEL: v_insertelement_v8i16_6:
2318 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2319 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
2320 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2321 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2322 ; CI-NEXT: v_mov_b32_e32 v1, s3
2323 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
2324 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2325 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2326 ; CI-NEXT: v_mov_b32_e32 v5, s1
2327 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
2328 ; CI-NEXT: s_mov_b32 s0, 0xffff
2329 ; CI-NEXT: v_mov_b32_e32 v6, s4
2330 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2331 ; CI-NEXT: s_waitcnt vmcnt(0)
2332 ; CI-NEXT: v_bfi_b32 v3, s0, v6, v3
2333 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2336 ; GFX11-LABEL: v_insertelement_v8i16_6:
2338 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
2339 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2340 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
2341 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2342 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7]
2343 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2344 ; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s0, v3
2345 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
2346 ; GFX11-NEXT: s_nop 0
2347 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2348 ; GFX11-NEXT: s_endpgm
; Address the input and output vectors with the sign-extended workitem id.
2349 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2350 %tid.ext = sext i32 %tid to i64
2351 %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2352 %out.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2353 %vec = load <8 x i16>, ptr addrspace(1) %in.gep
; Keep only the low 16 bits of %val. The i16-to-i16 bitcast that follows is an
; identity cast.
2354 %val.trunc = trunc i32 %val to i16
2355 %val.cvt = bitcast i16 %val.trunc to i16
; Constant insert index 6.
2356 %vecins = insertelement <8 x i16> %vec, i16 %val.cvt, i32 6
2357 store <8 x i16> %vecins, ptr addrspace(1) %out.gep
2361 define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
2362 ; GFX9-LABEL: v_insertelement_v8f16_dynamic:
2364 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2365 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
2366 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2367 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2368 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
2369 ; GFX9-NEXT: s_cmp_eq_u32 s7, 6
2370 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
2371 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2372 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7
2373 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100
2374 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2375 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc
2376 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2377 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2378 ; GFX9-NEXT: s_cmp_eq_u32 s7, 4
2379 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
2380 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2381 ; GFX9-NEXT: s_cmp_eq_u32 s7, 5
2382 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2
2383 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
2384 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2385 ; GFX9-NEXT: s_cmp_eq_u32 s7, 2
2386 ; GFX9-NEXT: v_perm_b32 v3, v3, v6, s2
2387 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc
2388 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2389 ; GFX9-NEXT: s_cmp_eq_u32 s7, 3
2390 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v1
2391 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
2392 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2393 ; GFX9-NEXT: s_cmp_eq_u32 s7, 0
2394 ; GFX9-NEXT: v_perm_b32 v2, v6, v2, s2
2395 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc
2396 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2397 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1
2398 ; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0
2399 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
2400 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2401 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
2402 ; GFX9-NEXT: v_perm_b32 v1, v6, v1, s2
2403 ; GFX9-NEXT: v_perm_b32 v0, v5, v0, s2
2404 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
2405 ; GFX9-NEXT: s_endpgm
2407 ; VI-LABEL: v_insertelement_v8f16_dynamic:
2409 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2410 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
2411 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2412 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2413 ; VI-NEXT: v_mov_b32_e32 v1, s3
2414 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
2415 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2416 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2417 ; VI-NEXT: v_mov_b32_e32 v5, s1
2418 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
2419 ; VI-NEXT: s_cmp_eq_u32 s5, 6
2420 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2421 ; VI-NEXT: v_mov_b32_e32 v6, s4
2422 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2423 ; VI-NEXT: s_cmp_eq_u32 s5, 7
2424 ; VI-NEXT: s_waitcnt vmcnt(0)
2425 ; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
2426 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2427 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2428 ; VI-NEXT: s_cmp_eq_u32 s5, 4
2429 ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
2430 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2431 ; VI-NEXT: s_cmp_eq_u32 s5, 5
2432 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
2433 ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
2434 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2435 ; VI-NEXT: s_cmp_eq_u32 s5, 2
2436 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2437 ; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
2438 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2439 ; VI-NEXT: s_cmp_eq_u32 s5, 3
2440 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
2441 ; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2442 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
2443 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
2444 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2445 ; VI-NEXT: s_cmp_eq_u32 s5, 0
2446 ; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2447 ; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc
2448 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2449 ; VI-NEXT: s_cmp_eq_u32 s5, 1
2450 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
2451 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
2452 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2453 ; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
2454 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
2455 ; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
2456 ; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2457 ; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2458 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2461 ; CI-LABEL: v_insertelement_v8f16_dynamic:
2463 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2464 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
2465 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2466 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2467 ; CI-NEXT: v_mov_b32_e32 v1, s3
2468 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
2469 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2470 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2471 ; CI-NEXT: v_mov_b32_e32 v5, s1
2472 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
2473 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s4
2474 ; CI-NEXT: s_cmp_eq_u32 s5, 7
2475 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2476 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2477 ; CI-NEXT: s_cmp_eq_u32 s5, 6
2478 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
2479 ; CI-NEXT: s_cmp_eq_u32 s5, 5
2480 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0
2481 ; CI-NEXT: s_cmp_eq_u32 s5, 4
2482 ; CI-NEXT: s_waitcnt vmcnt(0)
2483 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
2484 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
2485 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
2486 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
2487 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
2488 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
2489 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
2490 ; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
2491 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
2492 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
2493 ; CI-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2494 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
2495 ; CI-NEXT: s_cmp_eq_u32 s5, 3
2496 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
2497 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2498 ; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
2499 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2500 ; CI-NEXT: s_cmp_eq_u32 s5, 2
2501 ; CI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc
2502 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2503 ; CI-NEXT: s_cmp_eq_u32 s5, 1
2504 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
2505 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2506 ; CI-NEXT: s_cmp_eq_u32 s5, 0
2507 ; CI-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3]
2508 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
2509 ; CI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc
2510 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2511 ; CI-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2512 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
2513 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
2514 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
2515 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
2516 ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
2517 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2518 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2519 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2520 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v7
2521 ; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
2522 ; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v9
2523 ; CI-NEXT: v_or_b32_e32 v3, v3, v6
2524 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v10
2525 ; CI-NEXT: v_or_b32_e32 v2, v2, v7
2526 ; CI-NEXT: v_or_b32_e32 v1, v1, v8
2527 ; CI-NEXT: v_or_b32_e32 v0, v0, v6
2528 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2531 ; GFX11-LABEL: v_insertelement_v8f16_dynamic:
2533 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
2534 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
2535 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
2536 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2537 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7]
2538 ; GFX11-NEXT: s_cmp_eq_u32 s1, 6
2539 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
2540 ; GFX11-NEXT: s_cmp_eq_u32 s1, 7
2541 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2542 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, s0, s2
2543 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
2544 ; GFX11-NEXT: s_cmp_eq_u32 s1, 4
2545 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2546 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0
2547 ; GFX11-NEXT: s_cmp_eq_u32 s1, 5
2548 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
2549 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s3
2550 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0
2551 ; GFX11-NEXT: s_cmp_eq_u32 s1, 2
2552 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
2553 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s2
2554 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
2555 ; GFX11-NEXT: s_cmp_eq_u32 s1, 3
2556 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2
2557 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
2558 ; GFX11-NEXT: s_cmp_eq_u32 s1, 0
2559 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0
2560 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s2
2561 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
2562 ; GFX11-NEXT: s_cmp_eq_u32 s1, 1
2563 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s3
2564 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0
2565 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2
2566 ; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s0, s1
2567 ; GFX11-NEXT: v_perm_b32 v3, v3, v5, 0x5040100
2568 ; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100
2569 ; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100
2570 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
2571 ; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
2572 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
2573 ; GFX11-NEXT: s_nop 0
2574 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2575 ; GFX11-NEXT: s_endpgm
2576 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2577 %tid.ext = sext i32 %tid to i64
2578 %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
2579 %out.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %out, i64 %tid.ext
2580 %vec = load <8 x half>, ptr addrspace(1) %in.gep
2581 %val.trunc = trunc i32 %val to i16
2582 %val.cvt = bitcast i16 %val.trunc to half
2583 %vecins = insertelement <8 x half> %vec, half %val.cvt, i32 %n
2584 store <8 x half> %vecins, ptr addrspace(1) %out.gep
; Inserts a half value at constant index 3 of a <16 x half> vector.
; Each work item loads one vector from %in and stores the result to %out, both
; indexed by the x work-item id. The inserted value is the low 16 bits of %val
; reinterpreted as half.
; Element 3 is the high 16 bits of the second dword, so for every target the
; expected code rewrites only v1 and stores the other loaded registers unchanged.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the update script instead of editing them by hand.
2588 define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
2589 ; GFX9-LABEL: v_insertelement_v16f16_3:
2591 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2592 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
2593 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2594 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x5040100
2595 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2596 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
2597 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
2598 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2599 ; GFX9-NEXT: v_perm_b32 v1, s6, v1, v9
2600 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2601 ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
2602 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
2603 ; GFX9-NEXT: s_endpgm
2605 ; VI-LABEL: v_insertelement_v16f16_3:
2607 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2608 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
2609 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2610 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2611 ; VI-NEXT: v_mov_b32_e32 v1, s3
2612 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
2613 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2614 ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
2615 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2616 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2617 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2618 ; VI-NEXT: v_mov_b32_e32 v9, s1
2619 ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
2620 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
2621 ; VI-NEXT: s_lshl_b32 s1, s4, 16
2622 ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
2623 ; VI-NEXT: v_mov_b32_e32 v12, s1
2624 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
2625 ; VI-NEXT: s_waitcnt vmcnt(1)
2626 ; VI-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2627 ; VI-NEXT: s_waitcnt vmcnt(0)
2628 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
2629 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
2632 ; CI-LABEL: v_insertelement_v16f16_3:
2634 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2635 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
2636 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2637 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2638 ; CI-NEXT: v_mov_b32_e32 v0, s3
2639 ; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8
2640 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
2641 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
2642 ; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4
2643 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2644 ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2645 ; CI-NEXT: v_mov_b32_e32 v9, s1
2646 ; CI-NEXT: v_add_i32_e32 v8, vcc, s0, v8
2647 ; CI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
2648 ; CI-NEXT: v_add_i32_e32 v10, vcc, 16, v8
2649 ; CI-NEXT: s_lshl_b32 s1, s4, 16
2650 ; CI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
2651 ; CI-NEXT: s_waitcnt vmcnt(1)
2652 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
2653 ; CI-NEXT: v_or_b32_e32 v1, s1, v1
2654 ; CI-NEXT: s_waitcnt vmcnt(0)
2655 ; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
2656 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
2659 ; GFX11-LABEL: v_insertelement_v16f16_3:
2661 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
2662 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2663 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
2664 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2665 ; GFX11-NEXT: s_clause 0x1
2666 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7]
2667 ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16
2668 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2669 ; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100
2670 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2671 ; GFX11-NEXT: s_clause 0x1
2672 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
2673 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5]
2674 ; GFX11-NEXT: s_nop 0
2675 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2676 ; GFX11-NEXT: s_endpgm
; Per-work-item addressing: both pointers are indexed by the sign-extended
; x work-item id, in units of one <16 x half> vector.
2677 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2678 %tid.ext = sext i32 %tid to i64
2679 %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
2680 %out.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %out, i64 %tid.ext
2681 %vec = load <16 x half>, ptr addrspace(1) %in.gep
; trunc + bitcast reinterprets the low 16 bits of %val as a half bit pattern;
; no numeric conversion takes place.
2682 %val.trunc = trunc i32 %val to i16
2683 %val.cvt = bitcast i16 %val.trunc to half
2684 %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 3
2685 store <16 x half> %vecins, ptr addrspace(1) %out.gep
; Inserts an i16 value at constant index 6 of a <16 x i16> vector.
; Each work item loads one vector from %in and stores the result to %out, both
; indexed by the x work-item id. The inserted value is %val truncated to i16.
; Element 6 is the low 16 bits of the fourth dword, so for every target the
; expected code rewrites only v3 (a v_bfi_b32 with a 0xffff mask, or a
; v_perm_b32 in the fiji checks) and stores the other loaded registers unchanged.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the update script instead of editing them by hand.
2689 define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
2690 ; GFX9-LABEL: v_insertelement_v16i16_6:
2692 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2693 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
2694 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2695 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2696 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
2697 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
2698 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
2699 ; GFX9-NEXT: v_mov_b32_e32 v9, s6
2700 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2701 ; GFX9-NEXT: v_bfi_b32 v3, s2, v9, v3
2702 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2703 ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
2704 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
2705 ; GFX9-NEXT: s_endpgm
2707 ; VI-LABEL: v_insertelement_v16i16_6:
2709 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2710 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
2711 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2712 ; VI-NEXT: v_mov_b32_e32 v12, 0x3020504
2713 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2714 ; VI-NEXT: v_mov_b32_e32 v1, s3
2715 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
2716 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2717 ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
2718 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2719 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2720 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2721 ; VI-NEXT: v_mov_b32_e32 v9, s1
2722 ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
2723 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
2724 ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
2725 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
2726 ; VI-NEXT: s_waitcnt vmcnt(1)
2727 ; VI-NEXT: v_perm_b32 v3, s4, v3, v12
2728 ; VI-NEXT: s_waitcnt vmcnt(0)
2729 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
2730 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
2733 ; CI-LABEL: v_insertelement_v16i16_6:
2735 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2736 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
2737 ; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2738 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2739 ; CI-NEXT: v_mov_b32_e32 v1, s3
2740 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v8
2741 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2742 ; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v0
2743 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2744 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2745 ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2746 ; CI-NEXT: v_mov_b32_e32 v9, s1
2747 ; CI-NEXT: v_add_i32_e32 v8, vcc, s0, v8
2748 ; CI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
2749 ; CI-NEXT: v_add_i32_e32 v10, vcc, 16, v8
2750 ; CI-NEXT: s_mov_b32 s2, 0xffff
2751 ; CI-NEXT: v_mov_b32_e32 v12, s4
2752 ; CI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
2753 ; CI-NEXT: s_waitcnt vmcnt(1)
2754 ; CI-NEXT: v_bfi_b32 v3, s2, v12, v3
2755 ; CI-NEXT: s_waitcnt vmcnt(0)
2756 ; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
2757 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
2760 ; GFX11-LABEL: v_insertelement_v16i16_6:
2762 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
2763 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2764 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
2765 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2766 ; GFX11-NEXT: s_clause 0x1
2767 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7]
2768 ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16
2769 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2770 ; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s0, v3
2771 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2772 ; GFX11-NEXT: s_clause 0x1
2773 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
2774 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5]
2775 ; GFX11-NEXT: s_nop 0
2776 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2777 ; GFX11-NEXT: s_endpgm
; Per-work-item addressing: both pointers are indexed by the sign-extended
; x work-item id, in units of one <16 x i16> vector.
2778 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
2779 %tid.ext = sext i32 %tid to i64
2780 %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext
2781 %out.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %out, i64 %tid.ext
2782 %vec = load <16 x i16>, ptr addrspace(1) %in.gep
; NOTE(review): the i16-to-i16 bitcast below is an identity operation. It looks
; like it mirrors the trunc + bitcast-to-half shape of the f16 kernels in this
; file; confirm before removing it, since the check lines are generated from
; this exact IR.
2783 %val.trunc = trunc i32 %val to i16
2784 %val.cvt = bitcast i16 %val.trunc to i16
2785 %vecins = insertelement <16 x i16> %vec, i16 %val.cvt, i32 6
2786 store <16 x i16> %vecins, ptr addrspace(1) %out.gep
; Inserts a half value at a run-time index %n of a <16 x half> vector.
; Each work item loads one vector from %in and stores the result to %out, both
; indexed by the x work-item id. The inserted value is the low 16 bits of %val
; reinterpreted as half.
; Because the index is not a constant, the expected code for every target
; compares the index against each of the 16 element numbers (s_cmp_eq_u32 with
; 0 through 15) and picks either the original element or the new value with a
; v_cndmask per 16-bit lane, then re-packs the lanes into dwords.
; In the hawaii checks each lane is additionally converted to f32 before the
; select and back to f16 afterwards (v_cvt_f32_f16 / v_cvt_f16_f32).
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with the update script instead of editing them by hand.
2790 define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
2791 ; GFX9-LABEL: v_insertelement_v16f16_dynamic:
2793 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2794 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
2795 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 5, v0
2796 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2797 ; GFX9-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3]
2798 ; GFX9-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16
2799 ; GFX9-NEXT: s_cmp_eq_u32 s7, 6
2800 ; GFX9-NEXT: v_mov_b32_e32 v9, s6
2801 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2802 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7
2803 ; GFX9-NEXT: s_mov_b32 s2, 0x5040100
2804 ; GFX9-NEXT: s_waitcnt vmcnt(1)
2805 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc
2806 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
2807 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2808 ; GFX9-NEXT: s_cmp_eq_u32 s7, 4
2809 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
2810 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2811 ; GFX9-NEXT: s_cmp_eq_u32 s7, 5
2812 ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v3
2813 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
2814 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2815 ; GFX9-NEXT: s_cmp_eq_u32 s7, 2
2816 ; GFX9-NEXT: v_perm_b32 v4, v4, v10, s2
2817 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc
2818 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2819 ; GFX9-NEXT: s_cmp_eq_u32 s7, 3
2820 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v2
2821 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc
2822 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2823 ; GFX9-NEXT: s_cmp_eq_u32 s7, 0
2824 ; GFX9-NEXT: v_perm_b32 v3, v10, v3, s2
2825 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc
2826 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2827 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1
2828 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1
2829 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
2830 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2831 ; GFX9-NEXT: s_cmp_eq_u32 s7, 14
2832 ; GFX9-NEXT: v_perm_b32 v2, v10, v2, s2
2833 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc
2834 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2835 ; GFX9-NEXT: s_cmp_eq_u32 s7, 15
2836 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2837 ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v8
2838 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
2839 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2840 ; GFX9-NEXT: s_cmp_eq_u32 s7, 12
2841 ; GFX9-NEXT: v_perm_b32 v1, v10, v1, s2
2842 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc
2843 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2844 ; GFX9-NEXT: s_cmp_eq_u32 s7, 13
2845 ; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
2846 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
2847 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2848 ; GFX9-NEXT: s_cmp_eq_u32 s7, 10
2849 ; GFX9-NEXT: v_perm_b32 v8, v10, v8, s2
2850 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc
2851 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2852 ; GFX9-NEXT: s_cmp_eq_u32 s7, 11
2853 ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6
2854 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
2855 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2856 ; GFX9-NEXT: s_cmp_eq_u32 s7, 8
2857 ; GFX9-NEXT: v_perm_b32 v7, v10, v7, s2
2858 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc
2859 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2860 ; GFX9-NEXT: s_cmp_eq_u32 s7, 9
2861 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v5
2862 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
2863 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
2864 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc
2865 ; GFX9-NEXT: v_perm_b32 v6, v10, v6, s2
2866 ; GFX9-NEXT: v_perm_b32 v5, v9, v5, s2
2867 ; GFX9-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
2868 ; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1]
2869 ; GFX9-NEXT: s_endpgm
2871 ; VI-LABEL: v_insertelement_v16f16_dynamic:
2873 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2874 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
2875 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
2876 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2877 ; VI-NEXT: v_mov_b32_e32 v0, s3
2878 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8
2879 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
2880 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4
2881 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
2882 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2883 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2884 ; VI-NEXT: v_mov_b32_e32 v9, s1
2885 ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
2886 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
2887 ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
2888 ; VI-NEXT: s_cmp_eq_u32 s5, 14
2889 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
2890 ; VI-NEXT: v_mov_b32_e32 v12, s4
2891 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2892 ; VI-NEXT: s_cmp_eq_u32 s5, 15
2893 ; VI-NEXT: s_waitcnt vmcnt(1)
2894 ; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc
2895 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2896 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2897 ; VI-NEXT: s_cmp_eq_u32 s5, 12
2898 ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
2899 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2900 ; VI-NEXT: s_cmp_eq_u32 s5, 13
2901 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
2902 ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
2903 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2904 ; VI-NEXT: s_cmp_eq_u32 s5, 10
2905 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2906 ; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc
2907 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2908 ; VI-NEXT: s_cmp_eq_u32 s5, 11
2909 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1
2910 ; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2911 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14
2912 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
2913 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2914 ; VI-NEXT: s_cmp_eq_u32 s5, 8
2915 ; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2916 ; VI-NEXT: v_cndmask_b32_e32 v13, v15, v12, vcc
2917 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2918 ; VI-NEXT: s_cmp_eq_u32 s5, 9
2919 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
2920 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
2921 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
2922 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2923 ; VI-NEXT: s_cmp_eq_u32 s5, 6
2924 ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2925 ; VI-NEXT: v_cndmask_b32_e32 v13, v16, v12, vcc
2926 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2927 ; VI-NEXT: s_cmp_eq_u32 s5, 7
2928 ; VI-NEXT: s_waitcnt vmcnt(0)
2929 ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7
2930 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
2931 ; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
2932 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2933 ; VI-NEXT: s_cmp_eq_u32 s5, 4
2934 ; VI-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2935 ; VI-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc
2936 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2937 ; VI-NEXT: s_cmp_eq_u32 s5, 5
2938 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6
2939 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
2940 ; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
2941 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2942 ; VI-NEXT: s_cmp_eq_u32 s5, 2
2943 ; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2944 ; VI-NEXT: v_cndmask_b32_e32 v13, v18, v12, vcc
2945 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2946 ; VI-NEXT: s_cmp_eq_u32 s5, 3
2947 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
2948 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
2949 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc
2950 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2951 ; VI-NEXT: s_cmp_eq_u32 s5, 0
2952 ; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2953 ; VI-NEXT: v_cndmask_b32_e32 v13, v19, v12, vcc
2954 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2955 ; VI-NEXT: s_cmp_eq_u32 s5, 1
2956 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4
2957 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
2958 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2959 ; VI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc
2960 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
2961 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
2962 ; VI-NEXT: v_or_b32_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2963 ; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2964 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
2965 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
2968 ; CI-LABEL: v_insertelement_v16f16_dynamic:
2970 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2971 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
2972 ; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0
2973 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2974 ; CI-NEXT: v_mov_b32_e32 v1, s3
2975 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
2976 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2977 ; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
2978 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
2979 ; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3]
2980 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2981 ; CI-NEXT: v_mov_b32_e32 v5, s1
2982 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
2983 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s4
2984 ; CI-NEXT: s_cmp_eq_u32 s5, 15
2985 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
2986 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
2987 ; CI-NEXT: s_cmp_eq_u32 s5, 14
2988 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
2989 ; CI-NEXT: s_cmp_eq_u32 s5, 13
2990 ; CI-NEXT: s_cselect_b64 s[2:3], -1, 0
2991 ; CI-NEXT: s_cmp_eq_u32 s5, 12
2992 ; CI-NEXT: s_waitcnt vmcnt(1)
2993 ; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
2994 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
2995 ; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v8
2996 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
2997 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
2998 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
2999 ; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v7
3000 ; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
3001 ; CI-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[0:1]
3002 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
3003 ; CI-NEXT: s_cmp_eq_u32 s5, 11
3004 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
3005 ; CI-NEXT: s_waitcnt vmcnt(0)
3006 ; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
3007 ; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc
3008 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3009 ; CI-NEXT: s_cmp_eq_u32 s5, 10
3010 ; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
3011 ; CI-NEXT: v_cndmask_b32_e32 v13, v13, v6, vcc
3012 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3013 ; CI-NEXT: s_cmp_eq_u32 s5, 9
3014 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
3015 ; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
3016 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
3017 ; CI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
3018 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3019 ; CI-NEXT: s_cmp_eq_u32 s5, 8
3020 ; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
3021 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
3022 ; CI-NEXT: v_cndmask_b32_e32 v14, v14, v6, vcc
3023 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3024 ; CI-NEXT: s_cmp_eq_u32 s5, 7
3025 ; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v9
3026 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
3027 ; CI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
3028 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3029 ; CI-NEXT: s_cmp_eq_u32 s5, 6
3030 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
3031 ; CI-NEXT: v_cndmask_b32_e32 v15, v15, v6, vcc
3032 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3033 ; CI-NEXT: s_cmp_eq_u32 s5, 5
3034 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
3035 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
3036 ; CI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
3037 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3038 ; CI-NEXT: s_cmp_eq_u32 s5, 4
3039 ; CI-NEXT: v_or_b32_e32 v10, v10, v11
3040 ; CI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc
3041 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3042 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
3043 ; CI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
3044 ; CI-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[2:3]
3045 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
3046 ; CI-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1]
3047 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
3048 ; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v1
3049 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
3050 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
3051 ; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
3052 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
3053 ; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
3054 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
3055 ; CI-NEXT: v_or_b32_e32 v2, v2, v11
3056 ; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
3057 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
3058 ; CI-NEXT: s_cmp_eq_u32 s5, 3
3059 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
3060 ; CI-NEXT: v_or_b32_e32 v9, v9, v12
3061 ; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v14
3062 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3063 ; CI-NEXT: s_cmp_eq_u32 s5, 2
3064 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
3065 ; CI-NEXT: v_or_b32_e32 v7, v7, v12
3066 ; CI-NEXT: v_cndmask_b32_e32 v12, v17, v6, vcc
3067 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3068 ; CI-NEXT: s_cmp_eq_u32 s5, 1
3069 ; CI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
3070 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3071 ; CI-NEXT: s_cmp_eq_u32 s5, 0
3072 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
3073 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
3074 ; CI-NEXT: v_cndmask_b32_e32 v11, v11, v6, vcc
3075 ; CI-NEXT: s_cselect_b64 vcc, -1, 0
3076 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
3077 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
3078 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
3079 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
3080 ; CI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
3081 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
3082 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
3083 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
3084 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v12
3085 ; CI-NEXT: v_or_b32_e32 v8, v8, v13
3086 ; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v15
3087 ; CI-NEXT: v_or_b32_e32 v1, v1, v6
3088 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11
3089 ; CI-NEXT: v_or_b32_e32 v3, v3, v13
3090 ; CI-NEXT: v_or_b32_e32 v0, v0, v6
3091 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3093 ; CI-NEXT: v_add_i32_e32 v0, vcc, 16, v4
3094 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
3095 ; CI-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
3098 ; GFX11-LABEL: v_insertelement_v16f16_dynamic:
3100 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
3101 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0
3102 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
3103 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3104 ; GFX11-NEXT: s_clause 0x1
3105 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7]
3106 ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[6:7] offset:16
3107 ; GFX11-NEXT: s_cmp_eq_u32 s1, 6
3108 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3109 ; GFX11-NEXT: s_cmp_eq_u32 s1, 7
3110 ; GFX11-NEXT: s_waitcnt vmcnt(1)
3111 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v3, s0, s2
3112 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3113 ; GFX11-NEXT: s_cmp_eq_u32 s1, 4
3114 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
3115 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0
3116 ; GFX11-NEXT: s_cmp_eq_u32 s1, 5
3117 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2
3118 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s3
3119 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0
3120 ; GFX11-NEXT: s_cmp_eq_u32 s1, 2
3121 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1
3122 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s2
3123 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3124 ; GFX11-NEXT: s_cmp_eq_u32 s1, 3
3125 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2
3126 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3127 ; GFX11-NEXT: s_cmp_eq_u32 s1, 0
3128 ; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v0
3129 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s0, s2
3130 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3131 ; GFX11-NEXT: s_cmp_eq_u32 s1, 1
3132 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2
3133 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3134 ; GFX11-NEXT: s_cmp_eq_u32 s1, 14
3135 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3136 ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v7
3137 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s0, s3
3138 ; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100
3139 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, s0, s2
3140 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3141 ; GFX11-NEXT: s_cmp_eq_u32 s1, 15
3142 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s2
3143 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3144 ; GFX11-NEXT: s_cmp_eq_u32 s1, 12
3145 ; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6
3146 ; GFX11-NEXT: v_perm_b32 v2, v10, v2, 0x5040100
3147 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v13, s0, s2
3148 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3149 ; GFX11-NEXT: s_cmp_eq_u32 s1, 13
3150 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s2
3151 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3152 ; GFX11-NEXT: s_cmp_eq_u32 s1, 10
3153 ; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v5
3154 ; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, s0, s2
3155 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3156 ; GFX11-NEXT: s_cmp_eq_u32 s1, 11
3157 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, s2
3158 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3159 ; GFX11-NEXT: s_cmp_eq_u32 s1, 8
3160 ; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4
3161 ; GFX11-NEXT: v_cndmask_b32_e64 v13, v15, s0, s2
3162 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
3163 ; GFX11-NEXT: s_cmp_eq_u32 s1, 9
3164 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s0, s2
3165 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0
3166 ; GFX11-NEXT: v_perm_b32 v7, v10, v7, 0x5040100
3167 ; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s0, s1
3168 ; GFX11-NEXT: v_perm_b32 v6, v12, v6, 0x5040100
3169 ; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100
3170 ; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100
3171 ; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100
3172 ; GFX11-NEXT: v_perm_b32 v4, v14, v4, 0x5040100
3173 ; GFX11-NEXT: s_clause 0x1
3174 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
3175 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5]
3176 ; GFX11-NEXT: s_nop 0
3177 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3178 ; GFX11-NEXT: s_endpgm
; Per-work-item addressing: both pointers are indexed by the sign-extended
; x work-item id, in units of one <16 x half> vector.
3179 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
3180 %tid.ext = sext i32 %tid to i64
3181 %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
3182 %out.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %out, i64 %tid.ext
3183 %vec = load <16 x half>, ptr addrspace(1) %in.gep
; trunc + bitcast reinterprets the low 16 bits of %val as a half bit pattern;
; no numeric conversion takes place.
3184 %val.trunc = trunc i32 %val to i16
3185 %val.cvt = bitcast i16 %val.trunc to half
; The insertion index is the kernel argument %n, not a constant.
3186 %vecins = insertelement <16 x half> %vec, half %val.cvt, i32 %n
3187 store <16 x half> %vecins, ptr addrspace(1) %out.gep
; Intrinsic used by the kernels in this file to obtain the x work-item id that
; indexes the per-work-item input and output vectors.
3192 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 (nounwind) is referenced by kernel definitions such as
; s_insertelement_v2i16_0; group #1 (nounwind readnone) is attached to the
; intrinsic declaration above and to its call sites.
3194 attributes #0 = { nounwind }
3195 attributes #1 = { nounwind readnone }