1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
3 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s
4 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s
; Insert the constant i16 999 (0x3e7) into element 0 of a <2 x i16> that is
; loaded through an addrspace(4) pointer, then store the result to %out.
; The GFX9 checks expect one s_pack_lh_b32_b16 whose first source operand is
; 0x3e7. The shared CIVI checks expect s_and_b32 with 0xffff0000 followed by
; s_or_b32 with 0x3e7.
6 define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
7 ; GFX9-LABEL: s_insertelement_v2i16_0:
9 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
10 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
11 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
12 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
13 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14 ; GFX9-NEXT: s_pack_lh_b32_b16 s2, 0x3e7, s2
15 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
16 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
19 ; CIVI-LABEL: s_insertelement_v2i16_0:
21 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
22 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
23 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
24 ; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
25 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
26 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
27 ; CIVI-NEXT: s_and_b32 s0, s0, 0xffff0000
28 ; CIVI-NEXT: s_or_b32 s0, s0, 0x3e7
29 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
30 ; CIVI-NEXT: flat_store_dword v[0:1], v2
32 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
33 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
34 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Insert the i16 kernel argument %elt into element 0 of a <2 x i16> loaded
; through an addrspace(4) pointer, then store the result to %out. An unnamed
; [8 x i32] argument sits between %vec.ptr and %elt.
; The GFX9 checks expect s_pack_lh_b32_b16 taking the loaded argument and the
; loaded vector. The VI and CI checks expect the argument masked with 0xffff,
; the vector masked with 0xffff0000, and the two combined with s_or_b32.
39 define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
40 ; GFX9-LABEL: s_insertelement_v2i16_0_reg:
42 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
43 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
44 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
45 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
46 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
47 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
48 ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s6, s2
49 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
50 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
53 ; VI-LABEL: s_insertelement_v2i16_0_reg:
55 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
56 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
57 ; VI-NEXT: s_waitcnt lgkmcnt(0)
58 ; VI-NEXT: v_mov_b32_e32 v0, s0
59 ; VI-NEXT: s_load_dword s0, s[2:3], 0x0
60 ; VI-NEXT: v_mov_b32_e32 v1, s1
61 ; VI-NEXT: s_and_b32 s1, s4, 0xffff
62 ; VI-NEXT: s_waitcnt lgkmcnt(0)
63 ; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
64 ; VI-NEXT: s_or_b32 s0, s1, s0
65 ; VI-NEXT: v_mov_b32_e32 v2, s0
66 ; VI-NEXT: flat_store_dword v[0:1], v2
69 ; CI-LABEL: s_insertelement_v2i16_0_reg:
71 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
72 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
73 ; CI-NEXT: s_waitcnt lgkmcnt(0)
74 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
75 ; CI-NEXT: v_mov_b32_e32 v0, s0
76 ; CI-NEXT: v_mov_b32_e32 v1, s1
77 ; CI-NEXT: s_and_b32 s1, s4, 0xffff
78 ; CI-NEXT: s_waitcnt lgkmcnt(0)
79 ; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
80 ; CI-NEXT: s_or_b32 s0, s1, s0
81 ; CI-NEXT: v_mov_b32_e32 v2, s0
82 ; CI-NEXT: flat_store_dword v[0:1], v2
84 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
85 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
86 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Same insert as s_insertelement_v2i16_0_reg (i16 argument %elt into element
; 0), but element 1 of the loaded vector is additionally extracted,
; zero-extended to i32 and handed to an inline asm through an "s" constraint,
; so the high half of the vector has a second use besides the insert.
; The GFX9 checks expect s_lshr_b32 by 16 on the loaded vector, whose result
; feeds both s_pack_ll_b32_b16 and the asm use. The VI checks expect a
; separate s_lshr_b32 for the asm value next to the and/or sequence. The CI
; checks expect s_lshr_b32 followed by s_lshl_b32 and s_or_b32.
90 define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
91 ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
93 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
94 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
95 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
96 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
97 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
98 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
99 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
100 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s2
101 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
102 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
103 ; GFX9-NEXT: ;;#ASMSTART
104 ; GFX9-NEXT: ; use s2
105 ; GFX9-NEXT: ;;#ASMEND
106 ; GFX9-NEXT: s_endpgm
108 ; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
110 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
111 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
112 ; VI-NEXT: s_waitcnt lgkmcnt(0)
113 ; VI-NEXT: v_mov_b32_e32 v0, s0
114 ; VI-NEXT: s_load_dword s0, s[2:3], 0x0
115 ; VI-NEXT: v_mov_b32_e32 v1, s1
116 ; VI-NEXT: s_and_b32 s1, s4, 0xffff
117 ; VI-NEXT: s_waitcnt lgkmcnt(0)
118 ; VI-NEXT: s_lshr_b32 s2, s0, 16
119 ; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
120 ; VI-NEXT: s_or_b32 s0, s1, s0
121 ; VI-NEXT: v_mov_b32_e32 v2, s0
122 ; VI-NEXT: flat_store_dword v[0:1], v2
123 ; VI-NEXT: ;;#ASMSTART
128 ; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
130 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
131 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
132 ; CI-NEXT: s_waitcnt lgkmcnt(0)
133 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
134 ; CI-NEXT: v_mov_b32_e32 v1, s1
135 ; CI-NEXT: v_mov_b32_e32 v0, s0
136 ; CI-NEXT: s_and_b32 s0, s4, 0xffff
137 ; CI-NEXT: s_waitcnt lgkmcnt(0)
138 ; CI-NEXT: s_lshr_b32 s1, s2, 16
139 ; CI-NEXT: s_lshl_b32 s2, s1, 16
140 ; CI-NEXT: s_or_b32 s0, s0, s2
141 ; CI-NEXT: v_mov_b32_e32 v2, s0
142 ; CI-NEXT: flat_store_dword v[0:1], v2
143 ; CI-NEXT: ;;#ASMSTART
147 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
; Second use of the vector's high element, independent of the insert below.
148 %elt1 = extractelement <2 x i16> %vec, i32 1
149 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
150 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
151 %use1 = zext i16 %elt1 to i32
152 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
; Insert the high 16 bits of the i32 argument %elt.arg (lshr by 16, then trunc
; to i16) into element 0 of a <2 x i16> loaded through an addrspace(4)
; pointer, then store the result to %out.
; The GFX9 checks expect a single s_pack_hh_b32_b16 with no separate shift.
; The VI and CI checks expect s_lshr_b32 by 16 on the argument, s_and_b32 with
; 0xffff0000 on the vector, and s_or_b32 to combine them.
156 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
157 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
159 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
160 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
161 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
162 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
163 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
164 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
165 ; GFX9-NEXT: s_pack_hh_b32_b16 s2, s6, s2
166 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
167 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
168 ; GFX9-NEXT: s_endpgm
170 ; VI-LABEL: s_insertelement_v2i16_0_reghi:
172 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
173 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
174 ; VI-NEXT: s_waitcnt lgkmcnt(0)
175 ; VI-NEXT: v_mov_b32_e32 v0, s0
176 ; VI-NEXT: s_load_dword s0, s[2:3], 0x0
177 ; VI-NEXT: v_mov_b32_e32 v1, s1
178 ; VI-NEXT: s_lshr_b32 s1, s4, 16
179 ; VI-NEXT: s_waitcnt lgkmcnt(0)
180 ; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
181 ; VI-NEXT: s_or_b32 s0, s1, s0
182 ; VI-NEXT: v_mov_b32_e32 v2, s0
183 ; VI-NEXT: flat_store_dword v[0:1], v2
186 ; CI-LABEL: s_insertelement_v2i16_0_reghi:
188 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
189 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
190 ; CI-NEXT: s_waitcnt lgkmcnt(0)
191 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
192 ; CI-NEXT: v_mov_b32_e32 v0, s0
193 ; CI-NEXT: v_mov_b32_e32 v1, s1
194 ; CI-NEXT: s_lshr_b32 s1, s4, 16
195 ; CI-NEXT: s_waitcnt lgkmcnt(0)
196 ; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
197 ; CI-NEXT: s_or_b32 s0, s1, s0
198 ; CI-NEXT: v_mov_b32_e32 v2, s0
199 ; CI-NEXT: flat_store_dword v[0:1], v2
201 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
; The element to insert is bits 31..16 of %elt.arg.
202 %elt.hi = lshr i32 %elt.arg, 16
203 %elt = trunc i32 %elt.hi to i16
204 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
205 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Insert the high 16 bits of %elt.arg into element 0 of the loaded <2 x i16>,
; and also zero-extend that same i16 to i32 and pass it to an inline asm
; through an "s" constraint, so the shifted value has two uses.
; Unlike s_insertelement_v2i16_0_reghi, there is no [8 x i32] argument ahead
; of %elt.arg here.
; The GFX9 checks expect an explicit s_lshr_b32 by 16 whose result feeds both
; s_pack_lh_b32_b16 and the asm use. The VI and CI checks expect s_lshr_b32,
; s_and_b32 with 0xffff0000 and s_or_b32.
209 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
210 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
212 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
213 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
214 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
215 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
216 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
217 ; GFX9-NEXT: s_lshr_b32 s3, s6, 16
218 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
219 ; GFX9-NEXT: s_pack_lh_b32_b16 s2, s3, s2
220 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
221 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
222 ; GFX9-NEXT: ;;#ASMSTART
223 ; GFX9-NEXT: ; use s3
224 ; GFX9-NEXT: ;;#ASMEND
225 ; GFX9-NEXT: s_endpgm
227 ; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
229 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
230 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
231 ; VI-NEXT: s_waitcnt lgkmcnt(0)
232 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
233 ; VI-NEXT: v_mov_b32_e32 v0, s0
234 ; VI-NEXT: v_mov_b32_e32 v1, s1
235 ; VI-NEXT: s_lshr_b32 s0, s4, 16
236 ; VI-NEXT: s_waitcnt lgkmcnt(0)
237 ; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
238 ; VI-NEXT: s_or_b32 s1, s0, s1
239 ; VI-NEXT: v_mov_b32_e32 v2, s1
240 ; VI-NEXT: flat_store_dword v[0:1], v2
241 ; VI-NEXT: ;;#ASMSTART
246 ; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
248 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
249 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
250 ; CI-NEXT: s_waitcnt lgkmcnt(0)
251 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
252 ; CI-NEXT: v_mov_b32_e32 v0, s0
253 ; CI-NEXT: v_mov_b32_e32 v1, s1
254 ; CI-NEXT: s_lshr_b32 s0, s4, 16
255 ; CI-NEXT: s_waitcnt lgkmcnt(0)
256 ; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
257 ; CI-NEXT: s_or_b32 s1, s0, s1
258 ; CI-NEXT: v_mov_b32_e32 v2, s1
259 ; CI-NEXT: flat_store_dword v[0:1], v2
260 ; CI-NEXT: ;;#ASMSTART
264 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
265 %elt.hi = lshr i32 %elt.arg, 16
266 %elt = trunc i32 %elt.hi to i16
267 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
268 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Second use of the inserted element, kept alive via the inline asm.
269 %use1 = zext i16 %elt to i32
270 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
; Insert the high 16 bits of %elt.arg into element 0 of the loaded <2 x i16>
; while BOTH halves of the result have an extra use: the inserted i16 and the
; vector's original element 1 are each zero-extended to i32 and passed to
; their own inline asm through an "s" constraint.
; The GFX9 checks expect two s_lshr_b32 by 16 (one per half) feeding
; s_pack_ll_b32_b16 and the two asm uses. The VI checks expect the
; shift/and/or sequence plus a separate s_lshr_b32 of the vector. The CI
; checks expect the store value to be built with v_alignbit_b32.
274 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
275 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
277 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
278 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
279 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
280 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
281 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
282 ; GFX9-NEXT: s_lshr_b32 s3, s6, 16
283 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
284 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
285 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s3, s2
286 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
287 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
288 ; GFX9-NEXT: ;;#ASMSTART
289 ; GFX9-NEXT: ; use s3
290 ; GFX9-NEXT: ;;#ASMEND
291 ; GFX9-NEXT: ;;#ASMSTART
292 ; GFX9-NEXT: ; use s2
293 ; GFX9-NEXT: ;;#ASMEND
294 ; GFX9-NEXT: s_endpgm
296 ; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
298 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
299 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
300 ; VI-NEXT: s_waitcnt lgkmcnt(0)
301 ; VI-NEXT: v_mov_b32_e32 v0, s0
302 ; VI-NEXT: s_load_dword s0, s[2:3], 0x0
303 ; VI-NEXT: v_mov_b32_e32 v1, s1
304 ; VI-NEXT: s_lshr_b32 s1, s4, 16
305 ; VI-NEXT: s_waitcnt lgkmcnt(0)
306 ; VI-NEXT: s_lshr_b32 s2, s0, 16
307 ; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
308 ; VI-NEXT: s_or_b32 s0, s1, s0
309 ; VI-NEXT: v_mov_b32_e32 v2, s0
310 ; VI-NEXT: flat_store_dword v[0:1], v2
311 ; VI-NEXT: ;;#ASMSTART
314 ; VI-NEXT: ;;#ASMSTART
319 ; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
321 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
322 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
323 ; CI-NEXT: s_waitcnt lgkmcnt(0)
324 ; CI-NEXT: v_mov_b32_e32 v0, s0
325 ; CI-NEXT: s_load_dword s0, s[2:3], 0x0
326 ; CI-NEXT: v_mov_b32_e32 v2, s4
327 ; CI-NEXT: v_mov_b32_e32 v1, s1
328 ; CI-NEXT: s_lshr_b32 s1, s4, 16
329 ; CI-NEXT: s_waitcnt lgkmcnt(0)
330 ; CI-NEXT: s_lshr_b32 s0, s0, 16
331 ; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16
332 ; CI-NEXT: flat_store_dword v[0:1], v2
333 ; CI-NEXT: ;;#ASMSTART
336 ; CI-NEXT: ;;#ASMSTART
340 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
341 %elt.hi = lshr i32 %elt.arg, 16
342 %elt = trunc i32 %elt.hi to i16
343 %vec.hi = extractelement <2 x i16> %vec, i32 1
344 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
345 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Extra uses of both the inserted element and the vector's original high half.
346 %use1 = zext i16 %elt to i32
347 %vec.hi.use1 = zext i16 %vec.hi to i32
349 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
350 call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
; Insert the constant i16 999 (0x3e7) into element 1 of a <2 x i16> loaded
; through an addrspace(4) pointer, then store the result to %out.
; The GFX9 checks expect s_pack_ll_b32_b16 with 0x3e7 as the second source
; operand. The shared CIVI checks expect s_and_b32 with 0xffff followed by
; s_or_b32 with 0x3e70000 (999 shifted left by 16).
354 define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
355 ; GFX9-LABEL: s_insertelement_v2i16_1:
357 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
358 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
359 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
360 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
361 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
362 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x3e7
363 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
364 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
365 ; GFX9-NEXT: s_endpgm
367 ; CIVI-LABEL: s_insertelement_v2i16_1:
369 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
370 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
371 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
372 ; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
373 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
374 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
375 ; CIVI-NEXT: s_and_b32 s0, s0, 0xffff
376 ; CIVI-NEXT: s_or_b32 s0, s0, 0x3e70000
377 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
378 ; CIVI-NEXT: flat_store_dword v[0:1], v2
379 ; CIVI-NEXT: s_endpgm
380 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
381 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
382 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Insert the i16 kernel argument %elt into element 1 of a <2 x i16> loaded
; through an addrspace(4) pointer, then store the result to %out. An unnamed
; [8 x i32] argument sits between %vec.ptr and %elt.
; The GFX9 checks expect s_pack_ll_b32_b16 taking the loaded vector and the
; loaded argument. The VI and CI checks expect s_lshl_b32 by 16 on the
; argument, s_and_b32 with 0xffff on the vector, and s_or_b32 to combine them.
386 define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
387 ; GFX9-LABEL: s_insertelement_v2i16_1_reg:
389 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
390 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
391 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
392 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
393 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
394 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
395 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6
396 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
397 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
398 ; GFX9-NEXT: s_endpgm
400 ; VI-LABEL: s_insertelement_v2i16_1_reg:
402 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
403 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
404 ; VI-NEXT: s_waitcnt lgkmcnt(0)
405 ; VI-NEXT: v_mov_b32_e32 v0, s0
406 ; VI-NEXT: s_load_dword s0, s[2:3], 0x0
407 ; VI-NEXT: v_mov_b32_e32 v1, s1
408 ; VI-NEXT: s_lshl_b32 s1, s4, 16
409 ; VI-NEXT: s_waitcnt lgkmcnt(0)
410 ; VI-NEXT: s_and_b32 s0, s0, 0xffff
411 ; VI-NEXT: s_or_b32 s0, s0, s1
412 ; VI-NEXT: v_mov_b32_e32 v2, s0
413 ; VI-NEXT: flat_store_dword v[0:1], v2
416 ; CI-LABEL: s_insertelement_v2i16_1_reg:
418 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
419 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
420 ; CI-NEXT: s_waitcnt lgkmcnt(0)
421 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
422 ; CI-NEXT: v_mov_b32_e32 v0, s0
423 ; CI-NEXT: v_mov_b32_e32 v1, s1
424 ; CI-NEXT: s_lshl_b32 s1, s4, 16
425 ; CI-NEXT: s_waitcnt lgkmcnt(0)
426 ; CI-NEXT: s_and_b32 s0, s2, 0xffff
427 ; CI-NEXT: s_or_b32 s0, s0, s1
428 ; CI-NEXT: v_mov_b32_e32 v2, s0
429 ; CI-NEXT: flat_store_dword v[0:1], v2
431 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
432 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
433 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Insert the constant half 5.0 into element 0 of a <2 x half> loaded through
; an addrspace(4) pointer, then store the result to %out. The constant shows
; up in the checks as the 16-bit pattern 0x4500.
; The GFX9 checks expect s_lshr_b32 by 16 on the loaded vector followed by
; s_pack_ll_b32_b16 with 0x4500 as the first source operand. The shared CIVI
; checks expect s_and_b32 with 0xffff0000 followed by s_or_b32 with 0x4500.
437 define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
438 ; GFX9-LABEL: s_insertelement_v2f16_0:
440 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
441 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
442 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
443 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
444 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
445 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
446 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2
447 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
448 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
449 ; GFX9-NEXT: s_endpgm
451 ; CIVI-LABEL: s_insertelement_v2f16_0:
453 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
454 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
455 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
456 ; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
457 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
458 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
459 ; CIVI-NEXT: s_and_b32 s0, s0, 0xffff0000
460 ; CIVI-NEXT: s_or_b32 s0, s0, 0x4500
461 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
462 ; CIVI-NEXT: flat_store_dword v[0:1], v2
463 ; CIVI-NEXT: s_endpgm
464 %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
465 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
466 store <2 x half> %vecins, <2 x half> addrspace(1)* %out
; Insert the constant half 5.0 into element 1 of a <2 x half> loaded through
; an addrspace(4) pointer, then store the result to %out.
; The GFX9 checks expect s_pack_ll_b32_b16 with 0x4500 as the second source
; operand. The shared CIVI checks expect s_and_b32 with 0xffff followed by
; s_or_b32 with 0x45000000 (0x4500 shifted left by 16).
470 define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
471 ; GFX9-LABEL: s_insertelement_v2f16_1:
473 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
474 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
475 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
476 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
477 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
478 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, 0x4500
479 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
480 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
481 ; GFX9-NEXT: s_endpgm
483 ; CIVI-LABEL: s_insertelement_v2f16_1:
485 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
486 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
487 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
488 ; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0
489 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
490 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
491 ; CIVI-NEXT: s_and_b32 s0, s0, 0xffff
492 ; CIVI-NEXT: s_or_b32 s0, s0, 0x45000000
493 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
494 ; CIVI-NEXT: flat_store_dword v[0:1], v2
495 ; CIVI-NEXT: s_endpgm
496 %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
497 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
498 store <2 x half> %vecins, <2 x half> addrspace(1)* %out
; Per-work-item variant: each work-item uses llvm.amdgcn.workitem.id.x to
; index %in and %out (both addrspace(1)), loads a <2 x i16>, inserts the
; constant i16 999 (0x3e7) into element 0 and stores the result.
; The GFX9 checks expect v_bfi_b32 with a 0xffff mask and 0x3e7 held in an
; SGPR. The VI and CI checks expect v_and_b32 with 0xffff0000 followed by
; v_or_b32 with 0x3e7.
502 define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
503 ; GFX9-LABEL: v_insertelement_v2i16_0:
505 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
506 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
507 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
508 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
509 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
510 ; GFX9-NEXT: s_movk_i32 s2, 0x3e7
511 ; GFX9-NEXT: s_waitcnt vmcnt(0)
512 ; GFX9-NEXT: v_bfi_b32 v1, v2, s2, v1
513 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
514 ; GFX9-NEXT: s_endpgm
516 ; VI-LABEL: v_insertelement_v2i16_0:
518 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
519 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
520 ; VI-NEXT: s_waitcnt lgkmcnt(0)
521 ; VI-NEXT: v_mov_b32_e32 v1, s3
522 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
523 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
524 ; VI-NEXT: flat_load_dword v0, v[0:1]
525 ; VI-NEXT: v_mov_b32_e32 v3, s1
526 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
527 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
528 ; VI-NEXT: s_waitcnt vmcnt(0)
529 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
530 ; VI-NEXT: v_or_b32_e32 v0, 0x3e7, v0
531 ; VI-NEXT: flat_store_dword v[2:3], v0
534 ; CI-LABEL: v_insertelement_v2i16_0:
536 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
537 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
538 ; CI-NEXT: s_waitcnt lgkmcnt(0)
539 ; CI-NEXT: v_mov_b32_e32 v1, s3
540 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
541 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
542 ; CI-NEXT: flat_load_dword v0, v[0:1]
543 ; CI-NEXT: v_mov_b32_e32 v3, s1
544 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
545 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
546 ; CI-NEXT: s_waitcnt vmcnt(0)
547 ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
548 ; CI-NEXT: v_or_b32_e32 v0, 0x3e7, v0
549 ; CI-NEXT: flat_store_dword v[2:3], v0
; Address both buffers by the sign-extended work-item id.
551 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
552 %tid.ext = sext i32 %tid to i64
553 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
554 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
555 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
556 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
557 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Per-work-item variant: each work-item indexes %in and %out by
; llvm.amdgcn.workitem.id.x, loads a <2 x i16>, and inserts the high 16 bits
; of the i32 kernel argument %elt.arg (lshr by 16, then trunc) into element 0.
; The GFX9 checks expect v_lshrrev_b32_e64 by 16 on the argument and
; v_and_or_b32 with a 0xffff0000 mask. The VI and CI checks expect s_lshr_b32
; on the argument, v_and_b32 with 0xffff0000 and v_or_b32.
561 define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
562 ; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
564 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
565 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
566 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
567 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000
568 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
569 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
570 ; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s6
571 ; GFX9-NEXT: s_waitcnt vmcnt(0)
572 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2
573 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
574 ; GFX9-NEXT: s_endpgm
576 ; VI-LABEL: v_insertelement_v2i16_0_reghi:
578 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
579 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
580 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
581 ; VI-NEXT: s_waitcnt lgkmcnt(0)
582 ; VI-NEXT: v_mov_b32_e32 v1, s3
583 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
584 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
585 ; VI-NEXT: flat_load_dword v0, v[0:1]
586 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
587 ; VI-NEXT: v_mov_b32_e32 v3, s1
588 ; VI-NEXT: s_lshr_b32 s0, s4, 16
589 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
590 ; VI-NEXT: s_waitcnt vmcnt(0)
591 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
592 ; VI-NEXT: v_or_b32_e32 v0, s0, v0
593 ; VI-NEXT: flat_store_dword v[2:3], v0
596 ; CI-LABEL: v_insertelement_v2i16_0_reghi:
598 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
599 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
600 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
601 ; CI-NEXT: s_waitcnt lgkmcnt(0)
602 ; CI-NEXT: v_mov_b32_e32 v1, s3
603 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
604 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
605 ; CI-NEXT: flat_load_dword v3, v[0:1]
606 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
607 ; CI-NEXT: v_mov_b32_e32 v1, s1
608 ; CI-NEXT: s_lshr_b32 s0, s4, 16
609 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
610 ; CI-NEXT: s_waitcnt vmcnt(0)
611 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
612 ; CI-NEXT: v_or_b32_e32 v2, s0, v2
613 ; CI-NEXT: flat_store_dword v[0:1], v2
; Address both buffers by the sign-extended work-item id.
615 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
616 %tid.ext = sext i32 %tid to i64
617 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
618 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
619 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
; The element to insert is bits 31..16 of %elt.arg.
620 %elt.hi = lshr i32 %elt.arg, 16
621 %elt = trunc i32 %elt.hi to i16
622 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
623 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Per-work-item variant of the element-0 insert using the small constant
; i16 53, which the checks expect to appear directly as an instruction
; operand rather than being moved into a register first.
; The GFX9 checks expect v_bfi_b32 with a 0xffff mask and the literal 53. The
; VI and CI checks expect v_and_b32 with 0xffff0000 followed by v_or_b32 with
; 53.
627 define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
628 ; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
630 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
631 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
632 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
633 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
634 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
635 ; GFX9-NEXT: s_waitcnt vmcnt(0)
636 ; GFX9-NEXT: v_bfi_b32 v1, v2, 53, v1
637 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
638 ; GFX9-NEXT: s_endpgm
640 ; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
642 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
643 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
644 ; VI-NEXT: s_waitcnt lgkmcnt(0)
645 ; VI-NEXT: v_mov_b32_e32 v1, s3
646 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
647 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
648 ; VI-NEXT: flat_load_dword v0, v[0:1]
649 ; VI-NEXT: v_mov_b32_e32 v3, s1
650 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
651 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
652 ; VI-NEXT: s_waitcnt vmcnt(0)
653 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
654 ; VI-NEXT: v_or_b32_e32 v0, 53, v0
655 ; VI-NEXT: flat_store_dword v[2:3], v0
658 ; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
660 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
661 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
662 ; CI-NEXT: s_waitcnt lgkmcnt(0)
663 ; CI-NEXT: v_mov_b32_e32 v1, s3
664 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
665 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
666 ; CI-NEXT: flat_load_dword v0, v[0:1]
667 ; CI-NEXT: v_mov_b32_e32 v3, s1
668 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
669 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
670 ; CI-NEXT: s_waitcnt vmcnt(0)
671 ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
672 ; CI-NEXT: v_or_b32_e32 v0, 53, v0
673 ; CI-NEXT: flat_store_dword v[2:3], v0
; Address both buffers by the sign-extended work-item id.
675 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
676 %tid.ext = sext i32 %tid to i64
677 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
678 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
679 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
680 %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
681 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
685 ; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
; Per-work-item variant: each work-item indexes %in and %out by
; llvm.amdgcn.workitem.id.x, loads a <2 x i16>, and inserts the constant
; i16 999 (0x3e7) into element 1.
; The GFX9 checks currently expect 0x3e7 in an SGPR, v_and_b32 with 0xffff,
; and v_lshl_or_b32 shifting the constant by 16 at run time. The VI checks
; expect the pre-shifted 0x3e70000 combined through v_or_b32_sdwa with
; src1_sel WORD_0. The CI checks expect v_and_b32 with 0xffff followed by
; v_or_b32 with 0x3e70000.
686 define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
687 ; GFX9-LABEL: v_insertelement_v2i16_1:
689 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
690 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
691 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
692 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
693 ; GFX9-NEXT: s_movk_i32 s2, 0x3e7
694 ; GFX9-NEXT: s_waitcnt vmcnt(0)
695 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
696 ; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1
697 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
698 ; GFX9-NEXT: s_endpgm
700 ; VI-LABEL: v_insertelement_v2i16_1:
702 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
703 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
704 ; VI-NEXT: s_waitcnt lgkmcnt(0)
705 ; VI-NEXT: v_mov_b32_e32 v1, s3
706 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
707 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
708 ; VI-NEXT: flat_load_dword v0, v[0:1]
709 ; VI-NEXT: v_mov_b32_e32 v1, 0x3e70000
710 ; VI-NEXT: v_mov_b32_e32 v3, s1
711 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
712 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
713 ; VI-NEXT: s_waitcnt vmcnt(0)
714 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
715 ; VI-NEXT: flat_store_dword v[2:3], v0
718 ; CI-LABEL: v_insertelement_v2i16_1:
720 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
721 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
722 ; CI-NEXT: s_waitcnt lgkmcnt(0)
723 ; CI-NEXT: v_mov_b32_e32 v1, s3
724 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
725 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
726 ; CI-NEXT: flat_load_dword v0, v[0:1]
727 ; CI-NEXT: v_mov_b32_e32 v3, s1
728 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
729 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
730 ; CI-NEXT: s_waitcnt vmcnt(0)
731 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
732 ; CI-NEXT: v_or_b32_e32 v0, 0x3e70000, v0
733 ; CI-NEXT: flat_store_dword v[2:3], v0
; Address both buffers by the sign-extended work-item id.
735 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
736 %tid.ext = sext i32 %tid to i64
737 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
738 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
739 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
740 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
741 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Per-work-item variant of the element-1 insert using the small negative
; constant i16 -15.
; The GFX9 checks expect v_and_b32 with 0xffff followed by v_lshl_or_b32 with
; -15 written directly as an operand. The VI checks expect the pre-shifted
; 0xfff10000 in a VGPR combined through v_or_b32_sdwa with src1_sel WORD_0.
; The CI checks expect v_and_b32 with 0xffff followed by v_or_b32 with
; 0xfff10000.
745 define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
746 ; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm:
748 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
749 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
750 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
751 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
752 ; GFX9-NEXT: s_waitcnt vmcnt(0)
753 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
754 ; GFX9-NEXT: v_lshl_or_b32 v1, -15, 16, v1
755 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
756 ; GFX9-NEXT: s_endpgm
758 ; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
760 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
761 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
762 ; VI-NEXT: s_waitcnt lgkmcnt(0)
763 ; VI-NEXT: v_mov_b32_e32 v1, s3
764 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
765 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
766 ; VI-NEXT: flat_load_dword v0, v[0:1]
767 ; VI-NEXT: v_mov_b32_e32 v1, 0xfff10000
768 ; VI-NEXT: v_mov_b32_e32 v3, s1
769 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
770 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
771 ; VI-NEXT: s_waitcnt vmcnt(0)
772 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
773 ; VI-NEXT: flat_store_dword v[2:3], v0
776 ; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
778 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
779 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
780 ; CI-NEXT: s_waitcnt lgkmcnt(0)
781 ; CI-NEXT: v_mov_b32_e32 v1, s3
782 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
783 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
784 ; CI-NEXT: flat_load_dword v0, v[0:1]
785 ; CI-NEXT: v_mov_b32_e32 v3, s1
786 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
787 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
788 ; CI-NEXT: s_waitcnt vmcnt(0)
789 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
790 ; CI-NEXT: v_or_b32_e32 v0, 0xfff10000, v0
791 ; CI-NEXT: flat_store_dword v[2:3], v0
; Address both buffers by the sign-extended work-item id.
793 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
794 %tid.ext = sext i32 %tid to i64
795 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
796 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
797 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
798 %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
799 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Per-work-item variant: each work-item indexes %in and %out by
; llvm.amdgcn.workitem.id.x, loads a <2 x half>, and inserts the constant
; half 5.0 (0x4500 in the checks) into element 0.
; The GFX9 checks expect 0x4500 in a VGPR, v_lshrrev_b32 by 16 on the loaded
; value, and v_lshl_or_b32 to rebuild the dword. The VI and CI checks expect
; v_and_b32 with 0xffff0000 followed by v_or_b32 with 0x4500.
803 define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
804 ; GFX9-LABEL: v_insertelement_v2f16_0:
806 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
807 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
808 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500
809 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
810 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
811 ; GFX9-NEXT: s_waitcnt vmcnt(0)
812 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
813 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
814 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
815 ; GFX9-NEXT: s_endpgm
817 ; VI-LABEL: v_insertelement_v2f16_0:
819 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
820 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
821 ; VI-NEXT: s_waitcnt lgkmcnt(0)
822 ; VI-NEXT: v_mov_b32_e32 v1, s3
823 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
824 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
825 ; VI-NEXT: flat_load_dword v0, v[0:1]
826 ; VI-NEXT: v_mov_b32_e32 v3, s1
827 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
828 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
829 ; VI-NEXT: s_waitcnt vmcnt(0)
830 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
831 ; VI-NEXT: v_or_b32_e32 v0, 0x4500, v0
832 ; VI-NEXT: flat_store_dword v[2:3], v0
835 ; CI-LABEL: v_insertelement_v2f16_0:
837 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
838 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
839 ; CI-NEXT: s_waitcnt lgkmcnt(0)
840 ; CI-NEXT: v_mov_b32_e32 v1, s3
841 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
842 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
843 ; CI-NEXT: flat_load_dword v0, v[0:1]
844 ; CI-NEXT: v_mov_b32_e32 v3, s1
845 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
846 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
847 ; CI-NEXT: s_waitcnt vmcnt(0)
848 ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
849 ; CI-NEXT: v_or_b32_e32 v0, 0x4500, v0
850 ; CI-NEXT: flat_store_dword v[2:3], v0
; Address both buffers by the sign-extended work-item id.
852 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
853 %tid.ext = sext i32 %tid to i64
854 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
855 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
856 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
857 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
858 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; Same shape as the element-0 test with half 5.0, but the inserted constant
; is the raw half bit pattern 0xH0035 (decimal 53). The expected code uses 53
; directly as an instruction operand instead of materializing it in a
; register first.
862 define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
863 ; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm:
865 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
866 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
867 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
868 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
869 ; GFX9-NEXT: s_waitcnt vmcnt(0)
870 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
871 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, 53
872 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
873 ; GFX9-NEXT: s_endpgm
875 ; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
877 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
878 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
879 ; VI-NEXT: s_waitcnt lgkmcnt(0)
880 ; VI-NEXT: v_mov_b32_e32 v1, s3
881 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
882 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
883 ; VI-NEXT: flat_load_dword v0, v[0:1]
884 ; VI-NEXT: v_mov_b32_e32 v3, s1
885 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
886 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
887 ; VI-NEXT: s_waitcnt vmcnt(0)
888 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
889 ; VI-NEXT: v_or_b32_e32 v0, 53, v0
890 ; VI-NEXT: flat_store_dword v[2:3], v0
893 ; CI-LABEL: v_insertelement_v2f16_0_inlineimm:
895 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
896 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
897 ; CI-NEXT: s_waitcnt lgkmcnt(0)
898 ; CI-NEXT: v_mov_b32_e32 v1, s3
899 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
900 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
901 ; CI-NEXT: flat_load_dword v0, v[0:1]
902 ; CI-NEXT: v_mov_b32_e32 v3, s1
903 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
904 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
905 ; CI-NEXT: s_waitcnt vmcnt(0)
906 ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
907 ; CI-NEXT: v_or_b32_e32 v0, 53, v0
908 ; CI-NEXT: flat_store_dword v[2:3], v0
; Per-workitem addressing: one <2 x half> in, one <2 x half> out.
910 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
911 %tid.ext = sext i32 %tid to i64
912 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
913 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
914 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; 0xH0035 is a raw 16-bit half encoding (0x35 == 53), inserted at index 0.
915 %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0
916 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; Per-workitem insert of the constant half 5.0 (bit pattern 0x4500) into
; element 1 of a loaded <2 x half>. The expected code keeps the low 16 bits
; of the loaded dword and places the constant in the high 16 bits; the
; fiji and hawaii runs expect it pre-shifted as 0x45000000.
920 define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
921 ; GFX9-LABEL: v_insertelement_v2f16_1:
923 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
924 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
925 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
926 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
927 ; GFX9-NEXT: s_movk_i32 s2, 0x4500
928 ; GFX9-NEXT: s_waitcnt vmcnt(0)
929 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
930 ; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1
931 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
932 ; GFX9-NEXT: s_endpgm
934 ; VI-LABEL: v_insertelement_v2f16_1:
936 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
937 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
938 ; VI-NEXT: s_waitcnt lgkmcnt(0)
939 ; VI-NEXT: v_mov_b32_e32 v1, s3
940 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
941 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
942 ; VI-NEXT: flat_load_dword v0, v[0:1]
943 ; VI-NEXT: v_mov_b32_e32 v1, 0x45000000
944 ; VI-NEXT: v_mov_b32_e32 v3, s1
945 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
946 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
947 ; VI-NEXT: s_waitcnt vmcnt(0)
948 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
949 ; VI-NEXT: flat_store_dword v[2:3], v0
952 ; CI-LABEL: v_insertelement_v2f16_1:
954 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
955 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
956 ; CI-NEXT: s_waitcnt lgkmcnt(0)
957 ; CI-NEXT: v_mov_b32_e32 v1, s3
958 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
959 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
960 ; CI-NEXT: flat_load_dword v0, v[0:1]
961 ; CI-NEXT: v_mov_b32_e32 v3, s1
962 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
963 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
964 ; CI-NEXT: s_waitcnt vmcnt(0)
965 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
966 ; CI-NEXT: v_or_b32_e32 v0, 0x45000000, v0
967 ; CI-NEXT: flat_store_dword v[2:3], v0
; Input and output slots are both selected by the sign-extended workitem id.
969 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
970 %tid.ext = sext i32 %tid to i64
971 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
972 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
973 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; Constant value, constant index 1 (the high 16 bits in the expected output).
974 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
975 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; Element-1 insert of the raw half bit pattern 0xH0023 (decimal 35). The
; gfx900 checks expect 35 used directly as the shifted operand of
; v_lshl_or_b32; the fiji and hawaii checks expect the already shifted
; literal 0x230000.
979 define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
980 ; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm:
982 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
983 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
984 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
985 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
986 ; GFX9-NEXT: s_waitcnt vmcnt(0)
987 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
988 ; GFX9-NEXT: v_lshl_or_b32 v1, 35, 16, v1
989 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
990 ; GFX9-NEXT: s_endpgm
992 ; VI-LABEL: v_insertelement_v2f16_1_inlineimm:
994 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
995 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
996 ; VI-NEXT: s_waitcnt lgkmcnt(0)
997 ; VI-NEXT: v_mov_b32_e32 v1, s3
998 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
999 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1000 ; VI-NEXT: flat_load_dword v0, v[0:1]
1001 ; VI-NEXT: v_mov_b32_e32 v1, 0x230000
1002 ; VI-NEXT: v_mov_b32_e32 v3, s1
1003 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1004 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1005 ; VI-NEXT: s_waitcnt vmcnt(0)
1006 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1007 ; VI-NEXT: flat_store_dword v[2:3], v0
1010 ; CI-LABEL: v_insertelement_v2f16_1_inlineimm:
1012 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1013 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1014 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1015 ; CI-NEXT: v_mov_b32_e32 v1, s3
1016 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1017 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1018 ; CI-NEXT: flat_load_dword v0, v[0:1]
1019 ; CI-NEXT: v_mov_b32_e32 v3, s1
1020 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1021 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1022 ; CI-NEXT: s_waitcnt vmcnt(0)
1023 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1024 ; CI-NEXT: v_or_b32_e32 v0, 0x230000, v0
1025 ; CI-NEXT: flat_store_dword v[2:3], v0
; Per-workitem addressing: one <2 x half> in, one <2 x half> out.
1027 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1028 %tid.ext = sext i32 %tid to i64
1029 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1030 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1031 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; 0xH0023 is a raw 16-bit half encoding (0x23 == 35), inserted at index 1.
1032 %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1
1033 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1037 ; FIXME: Enable for others when argument load not split
; Scalar case with a runtime index: the index is read with a volatile load
; from %idx.ptr and the vector from %vec.ptr (both addrspace(4)), then i16
; 999 is inserted at that index and the result stored through %out.
; The expected code builds the lane mask 0xffff << (idx * 16) with scalar
; shifts and blends 0x3e703e7 (999 == 0x3e7 in both halves) into the loaded
; dword with s_andn2/s_and/s_or.
; NOTE(review): the FIXME above looks stale, since all three check prefixes
; already cover this function -- confirm before removing it.
1038 define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 {
1039 ; GFX9-LABEL: s_insertelement_v2i16_dynamic:
1041 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1042 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1043 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1044 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1045 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
1046 ; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
1047 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1048 ; GFX9-NEXT: s_lshl_b32 s2, s4, 4
1049 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2
1050 ; GFX9-NEXT: s_andn2_b32 s3, s5, s2
1051 ; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7
1052 ; GFX9-NEXT: s_or_b32 s2, s2, s3
1053 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1054 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1055 ; GFX9-NEXT: s_endpgm
1057 ; VI-LABEL: s_insertelement_v2i16_dynamic:
1059 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1060 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
1061 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1062 ; VI-NEXT: v_mov_b32_e32 v0, s0
1063 ; VI-NEXT: s_load_dword s0, s[4:5], 0x0
1064 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
1065 ; VI-NEXT: v_mov_b32_e32 v1, s1
1066 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1067 ; VI-NEXT: s_lshl_b32 s0, s0, 4
1068 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
1069 ; VI-NEXT: s_andn2_b32 s1, s2, s0
1070 ; VI-NEXT: s_and_b32 s0, s0, 0x3e703e7
1071 ; VI-NEXT: s_or_b32 s0, s0, s1
1072 ; VI-NEXT: v_mov_b32_e32 v2, s0
1073 ; VI-NEXT: flat_store_dword v[0:1], v2
1076 ; CI-LABEL: s_insertelement_v2i16_dynamic:
1078 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1079 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
1080 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1081 ; CI-NEXT: v_mov_b32_e32 v0, s0
1082 ; CI-NEXT: s_load_dword s0, s[4:5], 0x0
1083 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
1084 ; CI-NEXT: v_mov_b32_e32 v1, s1
1085 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1086 ; CI-NEXT: s_lshl_b32 s0, s0, 4
1087 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0
1088 ; CI-NEXT: s_andn2_b32 s1, s2, s0
1089 ; CI-NEXT: s_and_b32 s0, s0, 0x3e703e7
1090 ; CI-NEXT: s_or_b32 s0, s0, s1
1091 ; CI-NEXT: v_mov_b32_e32 v2, s0
1092 ; CI-NEXT: flat_store_dword v[0:1], v2
; The index load is volatile; the vector load is a plain load.
1094 %idx = load volatile i32, i32 addrspace(4)* %idx.ptr
1095 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
; Constant value 999, runtime index.
1096 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1097 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Per-workitem vector, runtime index taken from the kernel argument %idx.
; i16 999 is inserted into the <2 x i16> loaded from %in and the result is
; stored to %out. The expected code computes the mask 0xffff << (idx * 16)
; with scalar shifts and selects between the loaded dword and 0x3e703e7
; (999 in both halves) with a single v_bfi_b32.
1101 define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
1102 ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1104 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1105 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
1106 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1107 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7
1108 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1109 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1110 ; GFX9-NEXT: s_lshl_b32 s2, s6, 4
1111 ; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2
1112 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1113 ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1
1114 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1115 ; GFX9-NEXT: s_endpgm
1117 ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1119 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1120 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1121 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1122 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1123 ; VI-NEXT: v_mov_b32_e32 v1, s3
1124 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1125 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1126 ; VI-NEXT: flat_load_dword v0, v[0:1]
1127 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1128 ; VI-NEXT: s_lshl_b32 s0, s4, 4
1129 ; VI-NEXT: v_mov_b32_e32 v3, s1
1130 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
1131 ; VI-NEXT: v_mov_b32_e32 v1, 0x3e703e7
1132 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1133 ; VI-NEXT: s_waitcnt vmcnt(0)
1134 ; VI-NEXT: v_bfi_b32 v0, s0, v1, v0
1135 ; VI-NEXT: flat_store_dword v[2:3], v0
1138 ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1140 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1141 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1142 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1143 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1144 ; CI-NEXT: v_mov_b32_e32 v1, s3
1145 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1146 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1147 ; CI-NEXT: flat_load_dword v0, v[0:1]
1148 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1149 ; CI-NEXT: s_lshl_b32 s0, s4, 4
1150 ; CI-NEXT: v_mov_b32_e32 v3, s1
1151 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0
1152 ; CI-NEXT: v_mov_b32_e32 v1, 0x3e703e7
1153 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1154 ; CI-NEXT: s_waitcnt vmcnt(0)
1155 ; CI-NEXT: v_bfi_b32 v0, s0, v1, v0
1156 ; CI-NEXT: flat_store_dword v[2:3], v0
; Only the vector address depends on the workitem id; %idx is an argument.
1158 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1159 %tid.ext = sext i32 %tid to i64
1160 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1161 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1162 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
1163 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1164 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Per-workitem vector and per-workitem index: each lane loads its index from
; %idx.ptr and its <2 x half> from %in, inserts the raw half pattern 0xH1234
; at that index and stores to %out. The expected code shifts the 0xffff mask
; by idx * 16 with a vector shift and blends 0x12341234 (0x1234 in both
; halves) into the loaded dword with v_bfi_b32.
1168 define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
1169 ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1171 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1172 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1173 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1174 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1175 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
1176 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
1177 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
1178 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1179 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
1180 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2
1181 ; GFX9-NEXT: s_mov_b32 s2, 0x12341234
1182 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1183 ; GFX9-NEXT: v_bfi_b32 v1, v1, s2, v2
1184 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1185 ; GFX9-NEXT: s_endpgm
1187 ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1189 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1190 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
1191 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1192 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1193 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1194 ; VI-NEXT: v_mov_b32_e32 v1, s3
1195 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1196 ; VI-NEXT: v_mov_b32_e32 v3, s5
1197 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1198 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1199 ; VI-NEXT: flat_load_dword v2, v[2:3]
1200 ; VI-NEXT: flat_load_dword v3, v[0:1]
1201 ; VI-NEXT: s_mov_b32 s2, 0xffff
1202 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
1203 ; VI-NEXT: v_mov_b32_e32 v1, s1
1204 ; VI-NEXT: s_mov_b32 s0, 0x12341234
1205 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1206 ; VI-NEXT: s_waitcnt vmcnt(1)
1207 ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
1208 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2
1209 ; VI-NEXT: s_waitcnt vmcnt(0)
1210 ; VI-NEXT: v_bfi_b32 v2, v2, s0, v3
1211 ; VI-NEXT: flat_store_dword v[0:1], v2
1214 ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1216 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1217 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
1218 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1219 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1220 ; CI-NEXT: v_mov_b32_e32 v1, s3
1221 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1222 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1223 ; CI-NEXT: v_mov_b32_e32 v3, s5
1224 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1225 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1226 ; CI-NEXT: flat_load_dword v2, v[2:3]
1227 ; CI-NEXT: flat_load_dword v3, v[0:1]
1228 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
1229 ; CI-NEXT: v_mov_b32_e32 v1, s1
1230 ; CI-NEXT: s_mov_b32 s0, 0x12341234
1231 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1232 ; CI-NEXT: s_waitcnt vmcnt(1)
1233 ; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
1234 ; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
1235 ; CI-NEXT: s_waitcnt vmcnt(0)
1236 ; CI-NEXT: v_bfi_b32 v2, v2, s0, v3
1237 ; CI-NEXT: flat_store_dword v[0:1], v2
; Three addresses are derived from the workitem id: vector in, index in, and
; vector out.
1239 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1240 %tid.ext = sext i32 %tid to i64
1241 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1242 %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
1243 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1244 %idx = load i32, i32 addrspace(1)* %idx.gep
1245 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; Constant raw half pattern 0x1234, index loaded per workitem.
1246 %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
1247 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; <4 x half> case, element 0: the low 16 bits of the i32 argument %val are
; reinterpreted as half and inserted at index 0 of the vector loaded from
; %in. The expected code touches only the first dword of the loaded pair,
; selecting the low 16 bits from %val with v_bfi_b32 and mask 0xffff.
; An unnamed [8 x i32] argument sits between %in and %val; the checks load
; %val from a kernarg offset past it (0x30 for gfx900/fiji, 0xc for hawaii).
1251 define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
1252 ; GFX9-LABEL: v_insertelement_v4f16_0:
1254 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1255 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
1256 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1257 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
1258 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1259 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1260 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1261 ; GFX9-NEXT: v_bfi_b32 v0, v3, s6, v0
1262 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1263 ; GFX9-NEXT: s_endpgm
1265 ; VI-LABEL: v_insertelement_v4f16_0:
1267 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1268 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
1269 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1270 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1271 ; VI-NEXT: v_mov_b32_e32 v1, s3
1272 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1273 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1274 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1275 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1276 ; VI-NEXT: v_mov_b32_e32 v3, s1
1277 ; VI-NEXT: s_mov_b32 s0, 0xffff
1278 ; VI-NEXT: v_mov_b32_e32 v4, s4
1279 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1280 ; VI-NEXT: s_waitcnt vmcnt(0)
1281 ; VI-NEXT: v_bfi_b32 v0, s0, v4, v0
1282 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1285 ; CI-LABEL: v_insertelement_v4f16_0:
1287 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1288 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
1289 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1290 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1291 ; CI-NEXT: v_mov_b32_e32 v1, s3
1292 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1293 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1294 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1295 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1296 ; CI-NEXT: v_mov_b32_e32 v3, s1
1297 ; CI-NEXT: s_mov_b32 s0, 0xffff
1298 ; CI-NEXT: v_mov_b32_e32 v4, s4
1299 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1300 ; CI-NEXT: s_waitcnt vmcnt(0)
1301 ; CI-NEXT: v_bfi_b32 v0, s0, v4, v0
1302 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Per-workitem addressing of 8-byte <4 x half> elements.
1304 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1305 %tid.ext = sext i32 %tid to i64
1306 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1307 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1308 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
; Produce a non-constant half from the i32 argument: truncate, then bitcast.
1309 %val.trunc = trunc i32 %val to i16
1310 %val.cvt = bitcast i16 %val.trunc to half
1311 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
1312 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
; <4 x half> case, element 1: the low 16 bits of %val are reinterpreted as
; half and inserted at index 1. The expected code keeps the low 16 bits of
; the first loaded dword and ORs in %val shifted left by 16; the second
; dword is stored back unchanged. %val directly follows the two pointer
; arguments here (no unnamed padding argument).
1316 define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
1317 ; GFX9-LABEL: v_insertelement_v4f16_1:
1319 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1320 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
1321 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1322 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1323 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1324 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1325 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
1326 ; GFX9-NEXT: v_lshl_or_b32 v0, s6, 16, v0
1327 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1328 ; GFX9-NEXT: s_endpgm
1330 ; VI-LABEL: v_insertelement_v4f16_1:
1332 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1333 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1334 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1335 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1336 ; VI-NEXT: v_mov_b32_e32 v1, s3
1337 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1338 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1339 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1340 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1341 ; VI-NEXT: s_lshl_b32 s0, s4, 16
1342 ; VI-NEXT: v_mov_b32_e32 v3, s1
1343 ; VI-NEXT: v_mov_b32_e32 v4, s0
1344 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1345 ; VI-NEXT: s_waitcnt vmcnt(0)
1346 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1347 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1350 ; CI-LABEL: v_insertelement_v4f16_1:
1352 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1353 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1354 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1355 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1356 ; CI-NEXT: v_mov_b32_e32 v1, s3
1357 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1358 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1359 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1360 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1361 ; CI-NEXT: v_mov_b32_e32 v3, s1
1362 ; CI-NEXT: s_lshl_b32 s0, s4, 16
1363 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1364 ; CI-NEXT: s_waitcnt vmcnt(0)
1365 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1366 ; CI-NEXT: v_or_b32_e32 v0, s0, v0
1367 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Per-workitem addressing of 8-byte <4 x half> elements.
1369 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1370 %tid.ext = sext i32 %tid to i64
1371 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1372 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1373 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
; Produce a non-constant half from the i32 argument: truncate, then bitcast.
1374 %val.trunc = trunc i32 %val to i16
1375 %val.cvt = bitcast i16 %val.trunc to half
1376 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
1377 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
; <4 x half> case, element 2: the low 16 bits of %val are reinterpreted as
; half and inserted at index 2. The expected code modifies only the second
; loaded dword (v1), taking its low 16 bits from %val with v_bfi_b32 and
; mask 0xffff. As in the element-0 variant, an unnamed [8 x i32] argument
; precedes %val, which the checks load from offset 0x30 (0xc for hawaii).
1381 define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
1382 ; GFX9-LABEL: v_insertelement_v4f16_2:
1384 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1385 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x30
1386 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1387 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
1388 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1389 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1390 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1391 ; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1
1392 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1393 ; GFX9-NEXT: s_endpgm
1395 ; VI-LABEL: v_insertelement_v4f16_2:
1397 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1398 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
1399 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1400 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1401 ; VI-NEXT: v_mov_b32_e32 v1, s3
1402 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1403 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1404 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1405 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1406 ; VI-NEXT: v_mov_b32_e32 v3, s1
1407 ; VI-NEXT: s_mov_b32 s0, 0xffff
1408 ; VI-NEXT: v_mov_b32_e32 v4, s4
1409 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1410 ; VI-NEXT: s_waitcnt vmcnt(0)
1411 ; VI-NEXT: v_bfi_b32 v1, s0, v4, v1
1412 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1415 ; CI-LABEL: v_insertelement_v4f16_2:
1417 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1418 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
1419 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1420 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1421 ; CI-NEXT: v_mov_b32_e32 v1, s3
1422 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1423 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1424 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1425 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1426 ; CI-NEXT: v_mov_b32_e32 v3, s1
1427 ; CI-NEXT: s_mov_b32 s0, 0xffff
1428 ; CI-NEXT: v_mov_b32_e32 v4, s4
1429 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1430 ; CI-NEXT: s_waitcnt vmcnt(0)
1431 ; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
1432 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Per-workitem addressing of 8-byte <4 x half> elements.
1434 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1435 %tid.ext = sext i32 %tid to i64
1436 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1437 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1438 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
; Produce a non-constant half from the i32 argument: truncate, then bitcast.
1439 %val.trunc = trunc i32 %val to i16
1440 %val.cvt = bitcast i16 %val.trunc to half
1441 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
1442 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
; <4 x half> case, element 3: the low 16 bits of %val are reinterpreted as
; half and inserted at index 3. The expected code keeps the low 16 bits of
; the second loaded dword (v1) and ORs in %val shifted left by 16; the first
; dword is stored back unchanged.
1446 define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
1447 ; GFX9-LABEL: v_insertelement_v4f16_3:
1449 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1450 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
1451 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1452 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1453 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1454 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1455 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
1456 ; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1
1457 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1458 ; GFX9-NEXT: s_endpgm
1460 ; VI-LABEL: v_insertelement_v4f16_3:
1462 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1463 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1464 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1465 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1466 ; VI-NEXT: v_mov_b32_e32 v1, s3
1467 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1468 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1469 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1470 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1471 ; VI-NEXT: s_lshl_b32 s0, s4, 16
1472 ; VI-NEXT: v_mov_b32_e32 v3, s1
1473 ; VI-NEXT: v_mov_b32_e32 v4, s0
1474 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1475 ; VI-NEXT: s_waitcnt vmcnt(0)
1476 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1477 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1480 ; CI-LABEL: v_insertelement_v4f16_3:
1482 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1483 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1484 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1485 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1486 ; CI-NEXT: v_mov_b32_e32 v1, s3
1487 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1488 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1489 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1490 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1491 ; CI-NEXT: v_mov_b32_e32 v3, s1
1492 ; CI-NEXT: s_lshl_b32 s0, s4, 16
1493 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1494 ; CI-NEXT: s_waitcnt vmcnt(0)
1495 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
1496 ; CI-NEXT: v_or_b32_e32 v1, s0, v1
1497 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Per-workitem addressing of 8-byte <4 x half> elements.
1499 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1500 %tid.ext = sext i32 %tid to i64
1501 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1502 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1503 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
; Produce a non-constant half from the i32 argument: truncate, then bitcast.
1504 %val.trunc = trunc i32 %val to i16
1505 %val.cvt = bitcast i16 %val.trunc to half
1506 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
1507 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
; Integer counterpart of the <4 x half> element-2 test: the low 16 bits of
; %val are inserted at index 2 of a <4 x i16> loaded per workitem. The
; expected code modifies only the second loaded dword (v1), taking its low
; 16 bits from %val with v_bfi_b32 and mask 0xffff.
1511 define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
1512 ; GFX9-LABEL: v_insertelement_v4i16_2:
1514 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1515 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
1516 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1517 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
1518 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1519 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1520 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1521 ; GFX9-NEXT: v_bfi_b32 v1, v3, s6, v1
1522 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1523 ; GFX9-NEXT: s_endpgm
1525 ; VI-LABEL: v_insertelement_v4i16_2:
1527 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1528 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1529 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1530 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1531 ; VI-NEXT: v_mov_b32_e32 v1, s3
1532 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1533 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1534 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1535 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1536 ; VI-NEXT: v_mov_b32_e32 v3, s1
1537 ; VI-NEXT: s_mov_b32 s0, 0xffff
1538 ; VI-NEXT: v_mov_b32_e32 v4, s4
1539 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1540 ; VI-NEXT: s_waitcnt vmcnt(0)
1541 ; VI-NEXT: v_bfi_b32 v1, s0, v4, v1
1542 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1545 ; CI-LABEL: v_insertelement_v4i16_2:
1547 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1548 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1549 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1550 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1551 ; CI-NEXT: v_mov_b32_e32 v1, s3
1552 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1553 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1554 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1555 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1556 ; CI-NEXT: v_mov_b32_e32 v3, s1
1557 ; CI-NEXT: s_mov_b32 s0, 0xffff
1558 ; CI-NEXT: v_mov_b32_e32 v4, s4
1559 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1560 ; CI-NEXT: s_waitcnt vmcnt(0)
1561 ; CI-NEXT: v_bfi_b32 v1, s0, v4, v1
1562 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Per-workitem addressing of 8-byte <4 x i16> elements.
1564 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1565 %tid.ext = sext i32 %tid to i64
1566 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
1567 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
1568 %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
1569 %val.trunc = trunc i32 %val to i16
; NOTE(review): this bitcast is i16 to i16, i.e. a same-type cast; presumably
; carried over from the half variants of this test -- confirm before removing.
1570 %val.cvt = bitcast i16 %val.trunc to i16
1571 %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
1572 store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
1576 ; FIXME: Better code on CI?
; <4 x i16> insert with a runtime index that is not a kernel argument: the
; index comes from a volatile load through an undef pointer, the vector is
; loaded per workitem from %in, and the low 16 bits of %val are inserted.
; The expected code shifts a 64-bit 0xffff mask left by idx * 16, duplicates
; the low 16 bits of %val into both halves of a dword, and applies one
; v_bfi_b32 per dword of the vector using the two halves of the mask.
; NOTE(review): the undef pointer is presumably just a way to obtain an
; opaque per-lane index value -- confirm against the test's intent.
1577 define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
1578 ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1580 ; GFX9-NEXT: global_load_dword v2, v[0:1], off glc
1581 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1582 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1583 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
1584 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1585 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1586 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
1587 ; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff
1588 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
1589 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
1590 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s6
1591 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1592 ; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1
1593 ; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0
1594 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
1595 ; GFX9-NEXT: s_endpgm
1597 ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1599 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1600 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1601 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1602 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1603 ; VI-NEXT: v_mov_b32_e32 v1, s3
1604 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1605 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1606 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
1607 ; VI-NEXT: s_waitcnt vmcnt(0)
1608 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1609 ; VI-NEXT: s_mov_b64 s[2:3], 0xffff
1610 ; VI-NEXT: v_mov_b32_e32 v3, s1
1611 ; VI-NEXT: s_and_b32 s1, s4, s2
1612 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1613 ; VI-NEXT: s_lshl_b32 s0, s1, 16
1614 ; VI-NEXT: s_or_b32 s0, s1, s0
1615 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1616 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
1617 ; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3]
1618 ; VI-NEXT: s_waitcnt vmcnt(0)
1619 ; VI-NEXT: v_bfi_b32 v1, v5, s0, v1
1620 ; VI-NEXT: v_bfi_b32 v0, v4, s0, v0
1621 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1624 ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1626 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1627 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1628 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1629 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1630 ; CI-NEXT: v_mov_b32_e32 v1, s3
1631 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1632 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1633 ; CI-NEXT: flat_load_dword v4, v[0:1] glc
1634 ; CI-NEXT: s_waitcnt vmcnt(0)
1635 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1636 ; CI-NEXT: s_mov_b64 s[2:3], 0xffff
1637 ; CI-NEXT: v_mov_b32_e32 v3, s1
1638 ; CI-NEXT: s_lshl_b32 s1, s4, 16
1639 ; CI-NEXT: s_and_b32 s4, s4, s2
1640 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1641 ; CI-NEXT: s_or_b32 s0, s4, s1
1642 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1643 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
1644 ; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4
1645 ; CI-NEXT: s_waitcnt vmcnt(0)
1646 ; CI-NEXT: v_bfi_b32 v1, v5, s0, v1
1647 ; CI-NEXT: v_bfi_b32 v0, v4, s0, v0
1648 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Per-workitem addressing of 8-byte <4 x i16> elements.
1650 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1651 %tid.ext = sext i32 %tid to i64
1652 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
1653 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
; Volatile load through an undef pointer supplies the runtime index; it shows
; up as the glc-marked dword load in the expected output.
1654 %idx.val = load volatile i32, i32 addrspace(1)* undef
1655 %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
1656 %val.trunc = trunc i32 %val to i16
; NOTE(review): same-type bitcast (i16 to i16); presumably carried over from
; the half variants of this test -- confirm before removing.
1657 %val.cvt = bitcast i16 %val.trunc to i16
1658 %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
1659 store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
; Inserts a half value into a <4 x half> vector at a run-time index that is
; passed as the kernel argument %idxval. The vector is loaded from %in and the
; updated vector is stored to %out, both addressed by the x work-item id.
;
; In the expected output for each checked target the index is shifted left by
; 4 (i.e. multiplied by 16), a 64-bit 0xffff mask is shifted left by that
; amount with a scalar shift, and v_bfi_b32 is applied to each of the two
; loaded dwords using the matching half of the mask.
;
; The assertion lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script instead of editing them
; by hand.
1663 define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
1664 ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1666 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1667 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1668 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1669 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1670 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
1671 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6
1672 ; GFX9-NEXT: s_mov_b64 s[2:3], 0xffff
1673 ; GFX9-NEXT: s_lshl_b32 s4, s7, 4
1674 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
1675 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
1676 ; GFX9-NEXT: v_mov_b32_e32 v4, s5
1677 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1678 ; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1
1679 ; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0
1680 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
1681 ; GFX9-NEXT: s_endpgm
1683 ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1685 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1686 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
1687 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1688 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1689 ; VI-NEXT: v_mov_b32_e32 v1, s3
1690 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1691 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1692 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1693 ; VI-NEXT: s_mov_b64 s[2:3], 0xffff
1694 ; VI-NEXT: v_mov_b32_e32 v3, s1
1695 ; VI-NEXT: s_lshl_b32 s1, s5, 4
1696 ; VI-NEXT: s_and_b32 s4, s4, s2
1697 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1698 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
1699 ; VI-NEXT: s_lshl_b32 s2, s4, 16
1700 ; VI-NEXT: s_or_b32 s2, s4, s2
1701 ; VI-NEXT: v_mov_b32_e32 v4, s2
1702 ; VI-NEXT: v_mov_b32_e32 v5, s2
1703 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1704 ; VI-NEXT: s_waitcnt vmcnt(0)
1705 ; VI-NEXT: v_bfi_b32 v1, s1, v4, v1
1706 ; VI-NEXT: v_bfi_b32 v0, s0, v5, v0
1707 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1710 ; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1712 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1713 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
1714 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1715 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1716 ; CI-NEXT: v_mov_b32_e32 v1, s3
1717 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1718 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1719 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1720 ; CI-NEXT: s_mov_b64 s[2:3], 0xffff
1721 ; CI-NEXT: v_mov_b32_e32 v3, s1
1722 ; CI-NEXT: s_and_b32 s6, s4, s2
1723 ; CI-NEXT: s_lshl_b32 s1, s5, 4
1724 ; CI-NEXT: s_lshl_b32 s4, s4, 16
1725 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1726 ; CI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
1727 ; CI-NEXT: s_or_b32 s2, s6, s4
1728 ; CI-NEXT: v_mov_b32_e32 v4, s2
1729 ; CI-NEXT: v_mov_b32_e32 v5, s2
1730 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1731 ; CI-NEXT: s_waitcnt vmcnt(0)
1732 ; CI-NEXT: v_bfi_b32 v1, s1, v4, v1
1733 ; CI-NEXT: v_bfi_b32 v0, s0, v5, v0
1734 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Address the <4 x half> elements of %in and %out selected by the x work-item id.
1736 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1737 %tid.ext = sext i32 %tid to i64
1738 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1739 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1740 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
; Keep only the low 16 bits of %val and reinterpret them as a half.
1741 %val.trunc = trunc i32 %val to i16
1742 %val.cvt = bitcast i16 %val.trunc to half
; The insertion index is the kernel argument %idxval, not a constant.
1743 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
1744 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
; Intrinsic called by the kernels above to obtain %tid.
1748 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 is attached to the kernel definitions; #1 is attached to
; the intrinsic declaration above and to its call sites.
1750 attributes #0 = { nounwind }
1751 attributes #1 = { nounwind readnone }