1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
3 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,CIVI,VI %s
4 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
; Scalar case, constant into element 0.
; Loads a <2 x i16> through the addrspace(4) pointer %vec.ptr, overwrites
; element 0 with the constant 999 (printed as 0x3e7 in the expected output)
; and stores the resulting vector to %out. The gfx900 run expects one
; s_pack_lh_b32_b16; the fiji and hawaii runs share a mask-and-or sequence
; that keeps the upper 16 bits of the loaded dword.
6 define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
7 ; GFX9-LABEL: s_insertelement_v2i16_0:
9 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
11 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
12 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
13 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX9-NEXT: s_pack_lh_b32_b16 s0, 0x3e7, s2
16 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
17 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
20 ; CIVI-LABEL: s_insertelement_v2i16_0:
22 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
23 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
24 ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
25 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
26 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
27 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
28 ; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000
29 ; CIVI-NEXT: s_or_b32 s0, s0, 0x3e7
30 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
31 ; CIVI-NEXT: flat_store_dword v[0:1], v2
33 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
34 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
35 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Scalar case, kernel-argument value into element 0.
; Loads a <2 x i16> through the addrspace(4) pointer %vec.ptr and replaces
; element 0 with the i16 kernel argument %elt. An unnamed [8 x i32] argument
; sits between the two pointers and %elt; the expected output reads %elt with
; an s_load_dword at offset 0x30 (gfx900 and fiji runs) or 0xc (hawaii run).
40 define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
41 ; GFX9-LABEL: s_insertelement_v2i16_0_reg:
43 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
44 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
45 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
46 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
47 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
48 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
49 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
50 ; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s2
51 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
52 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
55 ; VI-LABEL: s_insertelement_v2i16_0_reg:
57 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
58 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
59 ; VI-NEXT: s_waitcnt lgkmcnt(0)
60 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
61 ; VI-NEXT: v_mov_b32_e32 v0, s0
62 ; VI-NEXT: v_mov_b32_e32 v1, s1
63 ; VI-NEXT: s_and_b32 s0, s4, 0xffff
64 ; VI-NEXT: s_waitcnt lgkmcnt(0)
65 ; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
66 ; VI-NEXT: s_or_b32 s0, s0, s1
67 ; VI-NEXT: v_mov_b32_e32 v2, s0
68 ; VI-NEXT: flat_store_dword v[0:1], v2
71 ; CI-LABEL: s_insertelement_v2i16_0_reg:
73 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
74 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
75 ; CI-NEXT: s_waitcnt lgkmcnt(0)
76 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
77 ; CI-NEXT: v_mov_b32_e32 v0, s0
78 ; CI-NEXT: v_mov_b32_e32 v1, s1
79 ; CI-NEXT: s_and_b32 s1, s4, 0xffff
80 ; CI-NEXT: s_waitcnt lgkmcnt(0)
81 ; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
82 ; CI-NEXT: s_or_b32 s0, s1, s0
83 ; CI-NEXT: v_mov_b32_e32 v2, s0
84 ; CI-NEXT: flat_store_dword v[0:1], v2
86 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
87 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
88 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Scalar case, element 0 replaced while the old element 1 has a second use.
; Besides feeding the insertelement, element 1 of the loaded vector is
; extracted, zero-extended to i32 and handed to an inline asm with an "s"
; constraint. The expected output therefore contains an explicit
; s_lshr_b32 ..., 16 of the loaded dword in every run, and the gfx900 run
; packs with s_pack_ll_b32_b16 using that shifted value.
92 define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
93 ; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
95 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
96 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
97 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
98 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
99 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
100 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
101 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
102 ; GFX9-NEXT: s_lshr_b32 s0, s2, 16
103 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0
104 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
105 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
106 ; GFX9-NEXT: ;;#ASMSTART
107 ; GFX9-NEXT: ; use s0
108 ; GFX9-NEXT: ;;#ASMEND
109 ; GFX9-NEXT: s_endpgm
111 ; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
113 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
114 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
115 ; VI-NEXT: s_waitcnt lgkmcnt(0)
116 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
117 ; VI-NEXT: v_mov_b32_e32 v0, s0
118 ; VI-NEXT: v_mov_b32_e32 v1, s1
119 ; VI-NEXT: s_and_b32 s0, s4, 0xffff
120 ; VI-NEXT: s_waitcnt lgkmcnt(0)
121 ; VI-NEXT: s_lshr_b32 s1, s2, 16
122 ; VI-NEXT: s_and_b32 s2, s2, 0xffff0000
123 ; VI-NEXT: s_or_b32 s0, s0, s2
124 ; VI-NEXT: v_mov_b32_e32 v2, s0
125 ; VI-NEXT: flat_store_dword v[0:1], v2
126 ; VI-NEXT: ;;#ASMSTART
131 ; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
133 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
134 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
135 ; CI-NEXT: s_waitcnt lgkmcnt(0)
136 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
137 ; CI-NEXT: v_mov_b32_e32 v1, s1
138 ; CI-NEXT: v_mov_b32_e32 v0, s0
139 ; CI-NEXT: s_and_b32 s0, s4, 0xffff
140 ; CI-NEXT: s_waitcnt lgkmcnt(0)
141 ; CI-NEXT: s_lshr_b32 s1, s2, 16
142 ; CI-NEXT: s_lshl_b32 s2, s1, 16
143 ; CI-NEXT: s_or_b32 s0, s0, s2
144 ; CI-NEXT: v_mov_b32_e32 v2, s0
145 ; CI-NEXT: flat_store_dword v[0:1], v2
146 ; CI-NEXT: ;;#ASMSTART
150 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
151 %elt1 = extractelement <2 x i16> %vec, i32 1
152 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
153 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
154 %use1 = zext i16 %elt1 to i32
155 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
; Scalar case, the inserted value is the high half of a 32-bit argument.
; %elt is formed by shifting the i32 argument %elt.arg right by 16 and
; truncating to i16; it then replaces element 0 of the loaded vector.
; The gfx900 run expects a single s_pack_hh_b32_b16, while the fiji and
; hawaii runs expect an s_lshr_b32 by 16 followed by a mask-and-or.
159 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
160 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
162 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
163 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
164 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
165 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
166 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
167 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
168 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
169 ; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s2
170 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
171 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
172 ; GFX9-NEXT: s_endpgm
174 ; VI-LABEL: s_insertelement_v2i16_0_reghi:
176 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
177 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
178 ; VI-NEXT: s_waitcnt lgkmcnt(0)
179 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
180 ; VI-NEXT: v_mov_b32_e32 v0, s0
181 ; VI-NEXT: v_mov_b32_e32 v1, s1
182 ; VI-NEXT: s_lshr_b32 s0, s4, 16
183 ; VI-NEXT: s_waitcnt lgkmcnt(0)
184 ; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
185 ; VI-NEXT: s_or_b32 s0, s0, s1
186 ; VI-NEXT: v_mov_b32_e32 v2, s0
187 ; VI-NEXT: flat_store_dword v[0:1], v2
190 ; CI-LABEL: s_insertelement_v2i16_0_reghi:
192 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
193 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
194 ; CI-NEXT: s_waitcnt lgkmcnt(0)
195 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
196 ; CI-NEXT: v_mov_b32_e32 v0, s0
197 ; CI-NEXT: v_mov_b32_e32 v1, s1
198 ; CI-NEXT: s_lshr_b32 s1, s4, 16
199 ; CI-NEXT: s_waitcnt lgkmcnt(0)
200 ; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
201 ; CI-NEXT: s_or_b32 s0, s1, s0
202 ; CI-NEXT: v_mov_b32_e32 v2, s0
203 ; CI-NEXT: flat_store_dword v[0:1], v2
205 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
206 %elt.hi = lshr i32 %elt.arg, 16
207 %elt = trunc i32 %elt.hi to i16
208 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
209 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Scalar case, high half of %elt.arg into element 0, with the inserted value
; used a second time.
; %elt (lshr by 16, then trunc) is both inserted into element 0 and
; zero-extended into an inline asm "s" operand. The shifted value therefore
; stays in its own register: the gfx900 run expects s_lshr_b32 followed by
; s_pack_lh_b32_b16. There is no padding argument in this kernel; %elt.arg is
; loaded from offset 0x10 (gfx900, fiji) or 0x4 (hawaii).
213 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
214 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
216 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
217 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
218 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
219 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
220 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
221 ; GFX9-NEXT: s_lshr_b32 s0, s4, 16
222 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
223 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
224 ; GFX9-NEXT: s_pack_lh_b32_b16 s1, s0, s2
225 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
226 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
227 ; GFX9-NEXT: ;;#ASMSTART
228 ; GFX9-NEXT: ; use s0
229 ; GFX9-NEXT: ;;#ASMEND
230 ; GFX9-NEXT: s_endpgm
232 ; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
234 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
235 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
236 ; VI-NEXT: s_waitcnt lgkmcnt(0)
237 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
238 ; VI-NEXT: v_mov_b32_e32 v0, s0
239 ; VI-NEXT: v_mov_b32_e32 v1, s1
240 ; VI-NEXT: s_lshr_b32 s0, s4, 16
241 ; VI-NEXT: s_waitcnt lgkmcnt(0)
242 ; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
243 ; VI-NEXT: s_or_b32 s1, s0, s1
244 ; VI-NEXT: v_mov_b32_e32 v2, s1
245 ; VI-NEXT: flat_store_dword v[0:1], v2
246 ; VI-NEXT: ;;#ASMSTART
251 ; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
253 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
254 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
255 ; CI-NEXT: s_waitcnt lgkmcnt(0)
256 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
257 ; CI-NEXT: v_mov_b32_e32 v0, s0
258 ; CI-NEXT: v_mov_b32_e32 v1, s1
259 ; CI-NEXT: s_lshr_b32 s0, s4, 16
260 ; CI-NEXT: s_waitcnt lgkmcnt(0)
261 ; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
262 ; CI-NEXT: s_or_b32 s1, s0, s1
263 ; CI-NEXT: v_mov_b32_e32 v2, s1
264 ; CI-NEXT: flat_store_dword v[0:1], v2
265 ; CI-NEXT: ;;#ASMSTART
269 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
270 %elt.hi = lshr i32 %elt.arg, 16
271 %elt = trunc i32 %elt.hi to i16
272 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
273 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
274 %use1 = zext i16 %elt to i32
275 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
; Scalar case, high half of %elt.arg into element 0, where both the inserted
; value and the preserved element 1 have extra uses.
; %elt and element 1 of the loaded vector (%vec.hi) are each zero-extended
; and passed to their own inline asm "s" operand. Every run is expected to
; materialize both 16-bit shifts (two s_lshr_b32 ..., 16); the gfx900 run
; then combines them with s_pack_ll_b32_b16.
279 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
280 ; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
282 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
283 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
284 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
285 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
286 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
287 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
288 ; GFX9-NEXT: s_lshr_b32 s0, s4, 16
289 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
290 ; GFX9-NEXT: s_lshr_b32 s1, s2, 16
291 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s0, s1
292 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
293 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
294 ; GFX9-NEXT: ;;#ASMSTART
295 ; GFX9-NEXT: ; use s0
296 ; GFX9-NEXT: ;;#ASMEND
297 ; GFX9-NEXT: ;;#ASMSTART
298 ; GFX9-NEXT: ; use s1
299 ; GFX9-NEXT: ;;#ASMEND
300 ; GFX9-NEXT: s_endpgm
302 ; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
304 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
305 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
306 ; VI-NEXT: s_waitcnt lgkmcnt(0)
307 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
308 ; VI-NEXT: v_mov_b32_e32 v0, s0
309 ; VI-NEXT: v_mov_b32_e32 v1, s1
310 ; VI-NEXT: s_lshr_b32 s0, s4, 16
311 ; VI-NEXT: s_waitcnt lgkmcnt(0)
312 ; VI-NEXT: s_lshr_b32 s1, s2, 16
313 ; VI-NEXT: s_and_b32 s2, s2, 0xffff0000
314 ; VI-NEXT: s_or_b32 s2, s0, s2
315 ; VI-NEXT: v_mov_b32_e32 v2, s2
316 ; VI-NEXT: flat_store_dword v[0:1], v2
317 ; VI-NEXT: ;;#ASMSTART
320 ; VI-NEXT: ;;#ASMSTART
325 ; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
327 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
328 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
329 ; CI-NEXT: s_waitcnt lgkmcnt(0)
330 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
331 ; CI-NEXT: v_mov_b32_e32 v1, s1
332 ; CI-NEXT: v_mov_b32_e32 v0, s0
333 ; CI-NEXT: s_lshr_b32 s0, s4, 16
334 ; CI-NEXT: s_waitcnt lgkmcnt(0)
335 ; CI-NEXT: s_lshr_b32 s1, s2, 16
336 ; CI-NEXT: s_lshl_b32 s2, s1, 16
337 ; CI-NEXT: s_or_b32 s2, s0, s2
338 ; CI-NEXT: v_mov_b32_e32 v2, s2
339 ; CI-NEXT: flat_store_dword v[0:1], v2
340 ; CI-NEXT: ;;#ASMSTART
343 ; CI-NEXT: ;;#ASMSTART
347 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
348 %elt.hi = lshr i32 %elt.arg, 16
349 %elt = trunc i32 %elt.hi to i16
350 %vec.hi = extractelement <2 x i16> %vec, i32 1
351 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
352 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
353 %use1 = zext i16 %elt to i32
354 %vec.hi.use1 = zext i16 %vec.hi to i32
356 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
357 call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
; Scalar case, constant into element 1.
; Loads a <2 x i16> through the addrspace(4) pointer %vec.ptr, overwrites
; element 1 with the constant 999 and stores the vector to %out. The gfx900
; run expects s_pack_ll_b32_b16 with 0x3e7 as the second source; the fiji and
; hawaii runs keep the low 16 bits and OR in 0x3e70000 (999 shifted left 16).
361 define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
362 ; GFX9-LABEL: s_insertelement_v2i16_1:
364 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
365 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
366 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
367 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
368 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
369 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
370 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, 0x3e7
371 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
372 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
373 ; GFX9-NEXT: s_endpgm
375 ; CIVI-LABEL: s_insertelement_v2i16_1:
377 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
378 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
379 ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
380 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
381 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
382 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
383 ; CIVI-NEXT: s_and_b32 s0, s2, 0xffff
384 ; CIVI-NEXT: s_or_b32 s0, s0, 0x3e70000
385 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
386 ; CIVI-NEXT: flat_store_dword v[0:1], v2
387 ; CIVI-NEXT: s_endpgm
388 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
389 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
390 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Scalar case, kernel-argument value into element 1.
; Replaces element 1 of the loaded <2 x i16> with the i16 argument %elt,
; which follows an unnamed [8 x i32] argument. The gfx900 run expects
; s_pack_ll_b32_b16 with the loaded dword first and %elt second; the fiji and
; hawaii runs shift %elt left by 16 and OR it with the masked low half.
394 define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
395 ; GFX9-LABEL: s_insertelement_v2i16_1_reg:
397 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
398 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
399 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
400 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
401 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
402 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
403 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
404 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s4
405 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
406 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
407 ; GFX9-NEXT: s_endpgm
409 ; VI-LABEL: s_insertelement_v2i16_1_reg:
411 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
412 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
413 ; VI-NEXT: s_waitcnt lgkmcnt(0)
414 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
415 ; VI-NEXT: v_mov_b32_e32 v0, s0
416 ; VI-NEXT: v_mov_b32_e32 v1, s1
417 ; VI-NEXT: s_lshl_b32 s0, s4, 16
418 ; VI-NEXT: s_waitcnt lgkmcnt(0)
419 ; VI-NEXT: s_and_b32 s1, s2, 0xffff
420 ; VI-NEXT: s_or_b32 s0, s1, s0
421 ; VI-NEXT: v_mov_b32_e32 v2, s0
422 ; VI-NEXT: flat_store_dword v[0:1], v2
425 ; CI-LABEL: s_insertelement_v2i16_1_reg:
427 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
428 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
429 ; CI-NEXT: s_waitcnt lgkmcnt(0)
430 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0
431 ; CI-NEXT: v_mov_b32_e32 v0, s0
432 ; CI-NEXT: v_mov_b32_e32 v1, s1
433 ; CI-NEXT: s_lshl_b32 s1, s4, 16
434 ; CI-NEXT: s_waitcnt lgkmcnt(0)
435 ; CI-NEXT: s_and_b32 s0, s2, 0xffff
436 ; CI-NEXT: s_or_b32 s0, s0, s1
437 ; CI-NEXT: v_mov_b32_e32 v2, s0
438 ; CI-NEXT: flat_store_dword v[0:1], v2
440 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
441 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
442 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Scalar case, half-precision constant into element 0.
; Loads a <2 x half> through the addrspace(4) pointer %vec.ptr and replaces
; element 0 with half 5.0, which appears as the bit pattern 0x4500 in the
; expected output. Note that the gfx900 run currently expects a separate
; s_lshr_b32 before s_pack_ll_b32_b16, unlike the i16 constant case.
446 define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
447 ; GFX9-LABEL: s_insertelement_v2f16_0:
449 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
450 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
451 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
452 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
453 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
454 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
455 ; GFX9-NEXT: s_lshr_b32 s0, s2, 16
456 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, 0x4500, s0
457 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
458 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
459 ; GFX9-NEXT: s_endpgm
461 ; CIVI-LABEL: s_insertelement_v2f16_0:
463 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
464 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
465 ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
466 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
467 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
468 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
469 ; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000
470 ; CIVI-NEXT: s_or_b32 s0, s0, 0x4500
471 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
472 ; CIVI-NEXT: flat_store_dword v[0:1], v2
473 ; CIVI-NEXT: s_endpgm
474 %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
475 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
476 store <2 x half> %vecins, <2 x half> addrspace(1)* %out
; Scalar case, half-precision constant into element 1.
; Replaces element 1 of the loaded <2 x half> with half 5.0 (bit pattern
; 0x4500). The gfx900 run expects s_pack_ll_b32_b16 with 0x4500 as the second
; source; the fiji and hawaii runs keep the low 16 bits and OR in 0x45000000.
480 define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
481 ; GFX9-LABEL: s_insertelement_v2f16_1:
483 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
484 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
485 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
486 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
487 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
488 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
489 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, 0x4500
490 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
491 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
492 ; GFX9-NEXT: s_endpgm
494 ; CIVI-LABEL: s_insertelement_v2f16_1:
496 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
497 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
498 ; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
499 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
500 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
501 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
502 ; CIVI-NEXT: s_and_b32 s0, s2, 0xffff
503 ; CIVI-NEXT: s_or_b32 s0, s0, 0x45000000
504 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
505 ; CIVI-NEXT: flat_store_dword v[0:1], v2
506 ; CIVI-NEXT: s_endpgm
507 %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
508 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
509 store <2 x half> %vecins, <2 x half> addrspace(1)* %out
; Per-workitem (vector register) case, constant into element 0.
; The workitem id x is sign-extended to i64 and used to index both %in and
; %out, so the vector is loaded from addrspace(1) at a per-workitem address.
; Element 0 is replaced with the constant 999. The gfx900 run expects a
; v_bfi_b32 with a 0xffff mask; the fiji and hawaii runs expect v_and_b32
; with 0xffff0000 followed by v_or_b32 with 0x3e7.
513 define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
514 ; GFX9-LABEL: v_insertelement_v2i16_0:
516 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
517 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
518 ; GFX9-NEXT: s_movk_i32 s4, 0x3e7
519 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
520 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
521 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
522 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
523 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
524 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
525 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
526 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
527 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
528 ; GFX9-NEXT: s_waitcnt vmcnt(0)
529 ; GFX9-NEXT: v_bfi_b32 v2, v3, s4, v4
530 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
531 ; GFX9-NEXT: s_endpgm
533 ; VI-LABEL: v_insertelement_v2i16_0:
535 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
536 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
537 ; VI-NEXT: s_waitcnt lgkmcnt(0)
538 ; VI-NEXT: v_mov_b32_e32 v1, s3
539 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
540 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
541 ; VI-NEXT: flat_load_dword v3, v[0:1]
542 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
543 ; VI-NEXT: v_mov_b32_e32 v1, s1
544 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
545 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
546 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
547 ; VI-NEXT: v_or_b32_e32 v2, 0x3e7, v2
548 ; VI-NEXT: flat_store_dword v[0:1], v2
551 ; CI-LABEL: v_insertelement_v2i16_0:
553 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
554 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
555 ; CI-NEXT: s_waitcnt lgkmcnt(0)
556 ; CI-NEXT: v_mov_b32_e32 v1, s3
557 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
558 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
559 ; CI-NEXT: flat_load_dword v3, v[0:1]
560 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
561 ; CI-NEXT: v_mov_b32_e32 v1, s1
562 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
563 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
564 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
565 ; CI-NEXT: v_or_b32_e32 v2, 0x3e7, v2
566 ; CI-NEXT: flat_store_dword v[0:1], v2
568 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
569 %tid.ext = sext i32 %tid to i64
570 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
571 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
572 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
573 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
574 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Per-workitem case, high half of a uniform 32-bit argument into element 0.
; The vector is loaded from %in indexed by the workitem id x; the inserted
; value is %elt.arg shifted right by 16 and truncated to i16. The gfx900 run
; expects v_lshrrev_b32 on the scalar argument followed by v_and_or_b32 with
; a 0xffff0000 mask; the fiji and hawaii runs do the shift with s_lshr_b32
; and then use v_and_b32 / v_or_b32.
578 define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
579 ; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
581 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
582 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
583 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
584 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000
585 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
586 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
587 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
588 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
589 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
590 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
591 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
592 ; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s4
593 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
594 ; GFX9-NEXT: s_waitcnt vmcnt(0)
595 ; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2
596 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
597 ; GFX9-NEXT: s_endpgm
599 ; VI-LABEL: v_insertelement_v2i16_0_reghi:
601 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
602 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
603 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
604 ; VI-NEXT: s_waitcnt lgkmcnt(0)
605 ; VI-NEXT: v_mov_b32_e32 v1, s3
606 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
607 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
608 ; VI-NEXT: flat_load_dword v3, v[0:1]
609 ; VI-NEXT: v_mov_b32_e32 v1, s1
610 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
611 ; VI-NEXT: s_lshr_b32 s1, s4, 16
612 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
613 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
614 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
615 ; VI-NEXT: v_or_b32_e32 v2, s1, v2
616 ; VI-NEXT: flat_store_dword v[0:1], v2
619 ; CI-LABEL: v_insertelement_v2i16_0_reghi:
621 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
622 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
623 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
624 ; CI-NEXT: s_waitcnt lgkmcnt(0)
625 ; CI-NEXT: v_mov_b32_e32 v1, s3
626 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
627 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
628 ; CI-NEXT: flat_load_dword v3, v[0:1]
629 ; CI-NEXT: v_mov_b32_e32 v1, s1
630 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
631 ; CI-NEXT: s_lshr_b32 s1, s4, 16
632 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
633 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
634 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
635 ; CI-NEXT: v_or_b32_e32 v2, s1, v2
636 ; CI-NEXT: flat_store_dword v[0:1], v2
638 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
639 %tid.ext = sext i32 %tid to i64
640 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
641 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
642 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
643 %elt.hi = lshr i32 %elt.arg, 16
644 %elt = trunc i32 %elt.hi to i16
645 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
646 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Per-workitem case, small constant into element 0.
; Identical in shape to the 999 variant of this test but inserts the
; constant 53. In the expected output 53 appears directly as an instruction
; operand (v_bfi_b32 on gfx900, v_or_b32 on fiji and hawaii) instead of being
; moved into a register first.
650 define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
651 ; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
653 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
654 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
655 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
656 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
657 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
658 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
659 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
660 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
661 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
662 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
663 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
664 ; GFX9-NEXT: s_waitcnt vmcnt(0)
665 ; GFX9-NEXT: v_bfi_b32 v2, v3, 53, v4
666 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
667 ; GFX9-NEXT: s_endpgm
669 ; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
671 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
672 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
673 ; VI-NEXT: s_waitcnt lgkmcnt(0)
674 ; VI-NEXT: v_mov_b32_e32 v1, s3
675 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
676 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
677 ; VI-NEXT: flat_load_dword v3, v[0:1]
678 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
679 ; VI-NEXT: v_mov_b32_e32 v1, s1
680 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
681 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
682 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
683 ; VI-NEXT: v_or_b32_e32 v2, 53, v2
684 ; VI-NEXT: flat_store_dword v[0:1], v2
687 ; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
689 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
690 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
691 ; CI-NEXT: s_waitcnt lgkmcnt(0)
692 ; CI-NEXT: v_mov_b32_e32 v1, s3
693 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
694 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
695 ; CI-NEXT: flat_load_dword v3, v[0:1]
696 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
697 ; CI-NEXT: v_mov_b32_e32 v1, s1
698 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
699 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
700 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
701 ; CI-NEXT: v_or_b32_e32 v2, 53, v2
702 ; CI-NEXT: flat_store_dword v[0:1], v2
704 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
705 %tid.ext = sext i32 %tid to i64
706 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
707 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
708 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
709 %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
710 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
714 ; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
; Per-workitem case, constant into element 1.
; The vector is loaded from %in indexed by the workitem id x and element 1 is
; replaced with the constant 999. The gfx900 run currently expects
; v_and_b32 with 0xffff followed by v_lshl_or_b32 shifting 0x3e7 left by 16
; at run time. The fiji run expects an SDWA v_or_b32 against 0x3e70000, and
; the hawaii run expects v_and_b32 / v_or_b32 with 0x3e70000.
715 define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
716 ; GFX9-LABEL: v_insertelement_v2i16_1:
718 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
719 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
720 ; GFX9-NEXT: s_movk_i32 s4, 0x3e7
721 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
722 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
723 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
724 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
725 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
726 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
727 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
728 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
729 ; GFX9-NEXT: s_waitcnt vmcnt(0)
730 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
731 ; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2
732 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
733 ; GFX9-NEXT: s_endpgm
735 ; VI-LABEL: v_insertelement_v2i16_1:
737 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
738 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
739 ; VI-NEXT: v_mov_b32_e32 v3, 0x3e70000
740 ; VI-NEXT: s_waitcnt lgkmcnt(0)
741 ; VI-NEXT: v_mov_b32_e32 v1, s3
742 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
743 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
744 ; VI-NEXT: flat_load_dword v4, v[0:1]
745 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
746 ; VI-NEXT: v_mov_b32_e32 v1, s1
747 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
748 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
749 ; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
750 ; VI-NEXT: flat_store_dword v[0:1], v2
753 ; CI-LABEL: v_insertelement_v2i16_1:
755 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
756 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
757 ; CI-NEXT: s_waitcnt lgkmcnt(0)
758 ; CI-NEXT: v_mov_b32_e32 v1, s3
759 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
760 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
761 ; CI-NEXT: flat_load_dword v3, v[0:1]
762 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
763 ; CI-NEXT: v_mov_b32_e32 v1, s1
764 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
765 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
766 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
767 ; CI-NEXT: v_or_b32_e32 v2, 0x3e70000, v2
768 ; CI-NEXT: flat_store_dword v[0:1], v2
770 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
771 %tid.ext = sext i32 %tid to i64
772 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
773 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
774 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
775 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
776 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Per-workitem case, small negative constant into element 1.
; The vector is loaded from %in indexed by the workitem id x and element 1 is
; replaced with the constant -15. On gfx900 the expected v_lshl_or_b32 takes
; -15 directly as an operand; the fiji and hawaii runs use the pre-shifted
; 32-bit value 0xfff10000 (the 16-bit pattern 0xfff1 in the upper half).
780 define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
781 ; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm:
783 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
784 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
785 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
786 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
787 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
788 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
789 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
790 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
791 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
792 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
793 ; GFX9-NEXT: s_waitcnt vmcnt(0)
794 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
795 ; GFX9-NEXT: v_lshl_or_b32 v2, -15, 16, v2
796 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
797 ; GFX9-NEXT: s_endpgm
799 ; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
801 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
802 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
803 ; VI-NEXT: v_mov_b32_e32 v3, 0xfff10000
804 ; VI-NEXT: s_waitcnt lgkmcnt(0)
805 ; VI-NEXT: v_mov_b32_e32 v1, s3
806 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
807 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
808 ; VI-NEXT: flat_load_dword v4, v[0:1]
809 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
810 ; VI-NEXT: v_mov_b32_e32 v1, s1
811 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
812 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
813 ; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
814 ; VI-NEXT: flat_store_dword v[0:1], v2
817 ; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
819 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
820 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
821 ; CI-NEXT: s_waitcnt lgkmcnt(0)
822 ; CI-NEXT: v_mov_b32_e32 v1, s3
823 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
824 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
825 ; CI-NEXT: flat_load_dword v3, v[0:1]
826 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
827 ; CI-NEXT: v_mov_b32_e32 v1, s1
828 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
829 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
830 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
831 ; CI-NEXT: v_or_b32_e32 v2, 0xfff10000, v2
832 ; CI-NEXT: flat_store_dword v[0:1], v2
834 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
835 %tid.ext = sext i32 %tid to i64
836 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
837 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
838 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
839 %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
840 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Per-work-item test: load a <2 x half> from %in at the work-item's index,
; replace element 0 with half 5.0, and store the vector to the same index of %out.
; The checks below expect the constant to show up as the 16-bit pattern 0x4500
; in the low half of the dword, with the loaded high half preserved
; (shift-right/v_lshl_or_b32 in the GFX9 checks, and/or with 0xffff0000 in the VI and CI checks).
844 define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
845 ; GFX9-LABEL: v_insertelement_v2f16_0:
847 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
848 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
849 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x4500
850 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
851 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
852 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
853 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
854 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
855 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
856 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
857 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
858 ; GFX9-NEXT: s_waitcnt vmcnt(0)
859 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4
860 ; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3
861 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
862 ; GFX9-NEXT: s_endpgm
864 ; VI-LABEL: v_insertelement_v2f16_0:
866 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
867 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
868 ; VI-NEXT: s_waitcnt lgkmcnt(0)
869 ; VI-NEXT: v_mov_b32_e32 v1, s3
870 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
871 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
872 ; VI-NEXT: flat_load_dword v3, v[0:1]
873 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
874 ; VI-NEXT: v_mov_b32_e32 v1, s1
875 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
876 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
877 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
878 ; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2
879 ; VI-NEXT: flat_store_dword v[0:1], v2
882 ; CI-LABEL: v_insertelement_v2f16_0:
884 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
885 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
886 ; CI-NEXT: s_waitcnt lgkmcnt(0)
887 ; CI-NEXT: v_mov_b32_e32 v1, s3
888 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
889 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
890 ; CI-NEXT: flat_load_dword v3, v[0:1]
891 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
892 ; CI-NEXT: v_mov_b32_e32 v1, s1
893 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
894 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
895 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
896 ; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2
897 ; CI-NEXT: flat_store_dword v[0:1], v2
; IR under test: both buffers are indexed by the sign-extended work-item id;
; the insert uses the constant lane index 0.
899 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
900 %tid.ext = sext i32 %tid to i64
901 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
902 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
903 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
904 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
905 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; Same shape as the element-0 half insert, but the inserted constant is the
; half bit pattern 0xH0035 (integer 53). The checks below expect 53 to be used
; directly as an instruction operand (v_lshl_or_b32 / v_or_b32) instead of
; first being moved into a register.
909 define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
910 ; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm:
912 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
913 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
914 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
915 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
916 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
917 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
918 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
919 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
920 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
921 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
922 ; GFX9-NEXT: s_waitcnt vmcnt(0)
923 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3
924 ; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, 53
925 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
926 ; GFX9-NEXT: s_endpgm
928 ; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
930 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
931 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
932 ; VI-NEXT: s_waitcnt lgkmcnt(0)
933 ; VI-NEXT: v_mov_b32_e32 v1, s3
934 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
935 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
936 ; VI-NEXT: flat_load_dword v3, v[0:1]
937 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
938 ; VI-NEXT: v_mov_b32_e32 v1, s1
939 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
940 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
941 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
942 ; VI-NEXT: v_or_b32_e32 v2, 53, v2
943 ; VI-NEXT: flat_store_dword v[0:1], v2
946 ; CI-LABEL: v_insertelement_v2f16_0_inlineimm:
948 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
949 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
950 ; CI-NEXT: s_waitcnt lgkmcnt(0)
951 ; CI-NEXT: v_mov_b32_e32 v1, s3
952 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
953 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
954 ; CI-NEXT: flat_load_dword v3, v[0:1]
955 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
956 ; CI-NEXT: v_mov_b32_e32 v1, s1
957 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
958 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
959 ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
960 ; CI-NEXT: v_or_b32_e32 v2, 53, v2
961 ; CI-NEXT: flat_store_dword v[0:1], v2
; IR under test: per-work-item load, insert of half 0xH0035 at constant lane
; index 0, and store back to the matching %out slot.
963 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
964 %tid.ext = sext i32 %tid to i64
965 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
966 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
967 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
968 %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0
969 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; Per-work-item test: load a <2 x half> from %in, replace element 1 (the high
; 16 bits of the dword) with half 5.0, and store the result to %out.
; The checks below expect the low half of the loaded dword to be kept
; (mask 0xffff or an SDWA WORD_0 source select) and the constant to land in the
; high half, either as 0x4500 shifted left by 16 (GFX9 checks) or as the
; pre-shifted literal 0x45000000 (VI and CI checks).
973 define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
974 ; GFX9-LABEL: v_insertelement_v2f16_1:
976 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
977 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
978 ; GFX9-NEXT: s_movk_i32 s4, 0x4500
979 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
980 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
981 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
982 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
983 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
984 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
985 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
986 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
987 ; GFX9-NEXT: s_waitcnt vmcnt(0)
988 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
989 ; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2
990 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
991 ; GFX9-NEXT: s_endpgm
993 ; VI-LABEL: v_insertelement_v2f16_1:
995 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
996 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
997 ; VI-NEXT: v_mov_b32_e32 v3, 0x45000000
998 ; VI-NEXT: s_waitcnt lgkmcnt(0)
999 ; VI-NEXT: v_mov_b32_e32 v1, s3
1000 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1001 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1002 ; VI-NEXT: flat_load_dword v4, v[0:1]
1003 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1004 ; VI-NEXT: v_mov_b32_e32 v1, s1
1005 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1006 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1007 ; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1008 ; VI-NEXT: flat_store_dword v[0:1], v2
1011 ; CI-LABEL: v_insertelement_v2f16_1:
1013 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1014 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1015 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1016 ; CI-NEXT: v_mov_b32_e32 v1, s3
1017 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1018 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1019 ; CI-NEXT: flat_load_dword v3, v[0:1]
1020 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1021 ; CI-NEXT: v_mov_b32_e32 v1, s1
1022 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1023 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1024 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
1025 ; CI-NEXT: v_or_b32_e32 v2, 0x45000000, v2
1026 ; CI-NEXT: flat_store_dword v[0:1], v2
; IR under test: per-work-item load, insert of half 5.0 at constant lane
; index 1, and store back to the matching %out slot.
1028 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1029 %tid.ext = sext i32 %tid to i64
1030 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1031 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1032 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
1033 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
1034 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; Element-1 variant of the small-constant insert: the inserted half has bit
; pattern 0xH0023 (integer 35). The GFX9 checks expect 35 to be passed directly
; to v_lshl_or_b32 with a shift of 16; the VI and CI checks expect the
; pre-shifted literal 0x230000 OR-ed with the preserved low half.
1038 define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
1039 ; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm:
1041 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1042 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1043 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1044 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1045 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1046 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1047 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1048 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1049 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1050 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1051 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1052 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3
1053 ; GFX9-NEXT: v_lshl_or_b32 v2, 35, 16, v2
1054 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1055 ; GFX9-NEXT: s_endpgm
1057 ; VI-LABEL: v_insertelement_v2f16_1_inlineimm:
1059 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1060 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1061 ; VI-NEXT: v_mov_b32_e32 v3, 0x230000
1062 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1063 ; VI-NEXT: v_mov_b32_e32 v1, s3
1064 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1065 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1066 ; VI-NEXT: flat_load_dword v4, v[0:1]
1067 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1068 ; VI-NEXT: v_mov_b32_e32 v1, s1
1069 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1070 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1071 ; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1072 ; VI-NEXT: flat_store_dword v[0:1], v2
1075 ; CI-LABEL: v_insertelement_v2f16_1_inlineimm:
1077 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1078 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1079 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1080 ; CI-NEXT: v_mov_b32_e32 v1, s3
1081 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1082 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1083 ; CI-NEXT: flat_load_dword v3, v[0:1]
1084 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1085 ; CI-NEXT: v_mov_b32_e32 v1, s1
1086 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1087 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1088 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3
1089 ; CI-NEXT: v_or_b32_e32 v2, 0x230000, v2
1090 ; CI-NEXT: flat_store_dword v[0:1], v2
; IR under test: per-work-item load, insert of half 0xH0023 at constant lane
; index 1, and store back to the matching %out slot.
1092 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1093 %tid.ext = sext i32 %tid to i64
1094 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1095 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1096 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
1097 %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1
1098 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1102 ; FIXME: Enable for others when argument load not split
; Dynamic-index insert where both the vector and the index come from
; addrspace(4) pointers and are fetched with scalar loads in the checks below.
; The IR inserts i16 999 (0x3e7) at a run-time lane index. The checks expect
; the value replicated into both halves (0x3e703e7), a lane mask built as
; 0xffff shifted left by index*16 (the index is first shifted left by 4), and a
; v_bfi_b32 that merges the constant with the loaded vector under that mask.
1103 define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 {
1104 ; GFX9-LABEL: s_insertelement_v2i16_dynamic:
1106 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1107 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
1108 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7
1109 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1110 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1111 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1112 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
1113 ; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0
1114 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1115 ; GFX9-NEXT: s_lshl_b32 s0, s0, 4
1116 ; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0
1117 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1118 ; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3
1119 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1120 ; GFX9-NEXT: s_endpgm
1122 ; VI-LABEL: s_insertelement_v2i16_dynamic:
1124 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1125 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
1126 ; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7
1127 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1128 ; VI-NEXT: v_mov_b32_e32 v0, s0
1129 ; VI-NEXT: v_mov_b32_e32 v1, s1
1130 ; VI-NEXT: s_load_dword s0, s[4:5], 0x0
1131 ; VI-NEXT: s_load_dword s1, s[2:3], 0x0
1132 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1133 ; VI-NEXT: s_lshl_b32 s0, s0, 4
1134 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
1135 ; VI-NEXT: v_mov_b32_e32 v3, s1
1136 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
1137 ; VI-NEXT: flat_store_dword v[0:1], v2
1140 ; CI-LABEL: s_insertelement_v2i16_dynamic:
1142 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1143 ; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
1144 ; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7
1145 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1146 ; CI-NEXT: v_mov_b32_e32 v0, s0
1147 ; CI-NEXT: v_mov_b32_e32 v1, s1
1148 ; CI-NEXT: s_load_dword s0, s[4:5], 0x0
1149 ; CI-NEXT: s_load_dword s1, s[2:3], 0x0
1150 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1151 ; CI-NEXT: s_lshl_b32 s0, s0, 4
1152 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0
1153 ; CI-NEXT: v_mov_b32_e32 v3, s1
1154 ; CI-NEXT: v_bfi_b32 v2, s0, v2, v3
1155 ; CI-NEXT: flat_store_dword v[0:1], v2
; IR under test: the index load is volatile and precedes the vector load; the
; result is stored to %out without any per-work-item indexing.
1157 %idx = load volatile i32, i32 addrspace(4)* %idx.ptr
1158 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
1159 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1160 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Dynamic-index insert where the vector is loaded per work-item from %in but
; the lane index %idx is a kernel argument. The checks below fetch the index
; with s_load_dword and build the lane mask with scalar shifts
; (index << 4, then 0xffff << that amount). The inserted value is i16 999,
; expected as the replicated literal 0x3e703e7 and merged into the loaded
; dword with v_bfi_b32.
1164 define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
1165 ; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1167 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1168 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1169 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
1170 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x3e703e7
1171 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1172 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1173 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1174 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1175 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1176 ; GFX9-NEXT: s_lshl_b32 s2, s4, 4
1177 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1178 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1179 ; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s2
1180 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1181 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1182 ; GFX9-NEXT: v_bfi_b32 v2, s0, v3, v4
1183 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1184 ; GFX9-NEXT: s_endpgm
1186 ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1188 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1189 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1190 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1191 ; VI-NEXT: v_mov_b32_e32 v3, 0x3e703e7
1192 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1193 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1194 ; VI-NEXT: v_mov_b32_e32 v1, s3
1195 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1196 ; VI-NEXT: flat_load_dword v4, v[0:1]
1197 ; VI-NEXT: s_lshl_b32 s2, s4, 4
1198 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1199 ; VI-NEXT: v_mov_b32_e32 v1, s1
1200 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s2
1201 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1202 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1203 ; VI-NEXT: v_bfi_b32 v2, s0, v3, v4
1204 ; VI-NEXT: flat_store_dword v[0:1], v2
1207 ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1209 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1210 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1211 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1212 ; CI-NEXT: v_mov_b32_e32 v3, 0x3e703e7
1213 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1214 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1215 ; CI-NEXT: v_mov_b32_e32 v1, s3
1216 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1217 ; CI-NEXT: flat_load_dword v4, v[0:1]
1218 ; CI-NEXT: s_lshl_b32 s2, s4, 4
1219 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1220 ; CI-NEXT: v_mov_b32_e32 v1, s1
1221 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s2
1222 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1223 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1224 ; CI-NEXT: v_bfi_b32 v2, s0, v3, v4
1225 ; CI-NEXT: flat_store_dword v[0:1], v2
; IR under test: per-work-item load from %in, insert of i16 999 at the
; argument-supplied index, and store to the matching %out slot.
1227 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1228 %tid.ext = sext i32 %tid to i64
1229 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1230 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1231 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
1232 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1233 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Dynamic-index insert where both the vector and the lane index are loaded per
; work-item from global memory (%in and %idx.ptr, each indexed by the
; work-item id). The inserted value is the half bit pattern 0xH1234, expected
; in the checks as the replicated literal 0x12341234. The lane mask is computed
; with vector shifts (index << 4, then 0xffff shifted by that amount) and the
; merge is a v_bfi_b32. The GFX9 and VI checks keep 0xffff in an SGPR and use
; v_lshlrev_b32_e64, while the CI checks use v_lshl_b32_e32 with the literal.
1237 define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
1238 ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1240 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1241 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
1242 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1243 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1244 ; GFX9-NEXT: s_mov_b32 s6, 0xffff
1245 ; GFX9-NEXT: s_mov_b32 s7, 0x12341234
1246 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1247 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1248 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
1249 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1250 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1251 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1252 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
1253 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1254 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
1255 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1256 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1257 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1258 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v4
1259 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6
1260 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1261 ; GFX9-NEXT: v_bfi_b32 v2, v2, s7, v3
1262 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1263 ; GFX9-NEXT: s_endpgm
1265 ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1267 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1268 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
1269 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1270 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1271 ; VI-NEXT: s_mov_b32 s6, 0xffff
1272 ; VI-NEXT: s_mov_b32 s7, 0x12341234
1273 ; VI-NEXT: v_mov_b32_e32 v3, s3
1274 ; VI-NEXT: v_mov_b32_e32 v1, s5
1275 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
1276 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1277 ; VI-NEXT: flat_load_dword v4, v[0:1]
1278 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1279 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1280 ; VI-NEXT: flat_load_dword v3, v[0:1]
1281 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
1282 ; VI-NEXT: v_mov_b32_e32 v1, s1
1283 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1284 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
1285 ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4
1286 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s6
1287 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1288 ; VI-NEXT: v_bfi_b32 v2, v2, s7, v3
1289 ; VI-NEXT: flat_store_dword v[0:1], v2
1292 ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1294 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1295 ; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
1296 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1297 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1298 ; CI-NEXT: s_mov_b32 s6, 0x12341234
1299 ; CI-NEXT: v_mov_b32_e32 v3, s3
1300 ; CI-NEXT: v_mov_b32_e32 v1, s5
1301 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
1302 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1303 ; CI-NEXT: flat_load_dword v4, v[0:1]
1304 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1305 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1306 ; CI-NEXT: flat_load_dword v3, v[0:1]
1307 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
1308 ; CI-NEXT: v_mov_b32_e32 v1, s1
1309 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1310 ; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
1311 ; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4
1312 ; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
1313 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1314 ; CI-NEXT: v_bfi_b32 v2, v2, s6, v3
1315 ; CI-NEXT: flat_store_dword v[0:1], v2
; IR under test: three GEPs share the sign-extended work-item id; the index is
; loaded before the vector, and the insert uses that per-work-item index.
1317 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1318 %tid.ext = sext i32 %tid to i64
1319 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1320 %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
1321 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1322 %idx = load i32, i32 addrspace(1)* %idx.gep
1323 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
1324 %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
1325 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; <4 x half> test: each work-item loads a 64-bit vector from %in, replaces
; element 0 with the low 16 bits of the kernel argument %val (truncated to i16
; and bitcast to half), and stores the vector to %out.
; An unnamed [8 x i32] argument sits between the pointers and %val; the checks
; below load a dword from kernarg offset 0x30 (GFX9 and VI checks) or 0xc
; (CI checks) and merge it into the first loaded dword (v0) only, leaving v1 untouched.
1329 define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
1330 ; GFX9-LABEL: v_insertelement_v4f16_0:
1332 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1333 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1334 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
1335 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
1336 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1337 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1338 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1339 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1340 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1341 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1342 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
1343 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1344 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1345 ; GFX9-NEXT: v_bfi_b32 v0, v4, s4, v0
1346 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1347 ; GFX9-NEXT: s_endpgm
1349 ; VI-LABEL: v_insertelement_v4f16_0:
1351 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1352 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1353 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
1354 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1355 ; VI-NEXT: v_mov_b32_e32 v1, s3
1356 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1357 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1358 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1359 ; VI-NEXT: v_mov_b32_e32 v3, s1
1360 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1361 ; VI-NEXT: s_and_b32 s1, s4, 0xffff
1362 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1363 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1364 ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
1365 ; VI-NEXT: v_or_b32_e32 v0, s1, v0
1366 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1369 ; CI-LABEL: v_insertelement_v4f16_0:
1371 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1372 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1373 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
1374 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1375 ; CI-NEXT: v_mov_b32_e32 v1, s3
1376 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1377 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1378 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1379 ; CI-NEXT: v_mov_b32_e32 v3, s1
1380 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1381 ; CI-NEXT: s_and_b32 s1, s4, 0xffff
1382 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1383 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1384 ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
1385 ; CI-NEXT: v_or_b32_e32 v0, s1, v0
1386 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; IR under test: %val is narrowed with trunc + bitcast so the inserted half is
; an arbitrary run-time bit pattern; the insert uses constant lane index 0.
1388 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1389 %tid.ext = sext i32 %tid to i64
1390 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1391 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1392 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
1393 %val.trunc = trunc i32 %val to i16
1394 %val.cvt = bitcast i16 %val.trunc to half
1395 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
1396 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
; <4 x half> test: each work-item loads a 64-bit vector from %in, replaces
; element 1 with the low 16 bits of the kernel argument %val (truncated to i16
; and bitcast to half), and stores the vector to %out.
; %val directly follows the two pointer arguments; the checks below load it
; from kernarg offset 0x10 (GFX9 and VI checks) or 0x4 (CI checks), shift it
; left by 16, and combine it with the low half of the first loaded dword (v0).
1400 define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
1401 ; GFX9-LABEL: v_insertelement_v4f16_1:
1403 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1404 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1405 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
1406 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1407 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1408 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1409 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1410 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1411 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1412 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
1413 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1414 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1415 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
1416 ; GFX9-NEXT: v_lshl_or_b32 v0, s4, 16, v0
1417 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1418 ; GFX9-NEXT: s_endpgm
1420 ; VI-LABEL: v_insertelement_v4f16_1:
1422 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1423 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1424 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1425 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1426 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1427 ; VI-NEXT: v_mov_b32_e32 v1, s3
1428 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1429 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1430 ; VI-NEXT: s_lshl_b32 s2, s4, 16
1431 ; VI-NEXT: v_mov_b32_e32 v4, s2
1432 ; VI-NEXT: v_mov_b32_e32 v3, s1
1433 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1434 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1435 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1436 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1437 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1440 ; CI-LABEL: v_insertelement_v4f16_1:
1442 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1443 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1444 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1445 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1446 ; CI-NEXT: v_mov_b32_e32 v1, s3
1447 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1448 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1449 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1450 ; CI-NEXT: v_mov_b32_e32 v3, s1
1451 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1452 ; CI-NEXT: s_lshl_b32 s1, s4, 16
1453 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1454 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1455 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1456 ; CI-NEXT: v_or_b32_e32 v0, s1, v0
1457 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; IR under test: %val is narrowed with trunc + bitcast so the inserted half is
; an arbitrary run-time bit pattern; the insert uses constant lane index 1.
1459 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1460 %tid.ext = sext i32 %tid to i64
1461 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1462 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1463 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
1464 %val.trunc = trunc i32 %val to i16
1465 %val.cvt = bitcast i16 %val.trunc to half
1466 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
1467 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
; <4 x half> test: each work-item loads a 64-bit vector from %in, replaces
; element 2 with the low 16 bits of the kernel argument %val (truncated to i16
; and bitcast to half), and stores the vector to %out.
; An unnamed [8 x i32] argument sits between the pointers and %val; the checks
; below load a dword from kernarg offset 0x30 (GFX9 and VI checks) or 0xc
; (CI checks) and merge it into the low half of the second loaded dword (v1),
; leaving v0 untouched.
1471 define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
1472 ; GFX9-LABEL: v_insertelement_v4f16_2:
1474 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1475 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1476 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
1477 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
1478 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1479 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1480 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1481 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1482 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1483 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1484 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
1485 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1486 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1487 ; GFX9-NEXT: v_bfi_b32 v1, v4, s4, v1
1488 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1489 ; GFX9-NEXT: s_endpgm
1491 ; VI-LABEL: v_insertelement_v4f16_2:
1493 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1494 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1495 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30
1496 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1497 ; VI-NEXT: v_mov_b32_e32 v1, s3
1498 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1499 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1500 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1501 ; VI-NEXT: v_mov_b32_e32 v3, s1
1502 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1503 ; VI-NEXT: s_and_b32 s1, s4, 0xffff
1504 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1505 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1506 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
1507 ; VI-NEXT: v_or_b32_e32 v1, s1, v1
1508 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1511 ; CI-LABEL: v_insertelement_v4f16_2:
1513 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1514 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1515 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc
1516 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1517 ; CI-NEXT: v_mov_b32_e32 v1, s3
1518 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1519 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1520 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1521 ; CI-NEXT: v_mov_b32_e32 v3, s1
1522 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1523 ; CI-NEXT: s_and_b32 s1, s4, 0xffff
1524 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1525 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1526 ; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
1527 ; CI-NEXT: v_or_b32_e32 v1, s1, v1
1528 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; IR under test: %val is narrowed with trunc + bitcast so the inserted half is
; an arbitrary run-time bit pattern; the insert uses constant lane index 2.
1530 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1531 %tid.ext = sext i32 %tid to i64
1532 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1533 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1534 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
1535 %val.trunc = trunc i32 %val to i16
1536 %val.cvt = bitcast i16 %val.trunc to half
1537 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
1538 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
; <4 x half> test: each work-item loads a 64-bit vector from %in, replaces
; element 3 with the low 16 bits of the kernel argument %val (truncated to i16
; and bitcast to half), and stores the vector to %out.
; %val directly follows the two pointer arguments; the checks below load it
; from kernarg offset 0x10 (GFX9 and VI checks) or 0x4 (CI checks), shift it
; left by 16, and combine it with the low half of the second loaded dword (v1),
; leaving v0 untouched.
1542 define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
1543 ; GFX9-LABEL: v_insertelement_v4f16_3:
1545 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1546 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1547 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
1548 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1549 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1550 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1551 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1552 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1553 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1554 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
1555 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1556 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1557 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
1558 ; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1
1559 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1560 ; GFX9-NEXT: s_endpgm
1562 ; VI-LABEL: v_insertelement_v4f16_3:
1564 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1565 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1566 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1567 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1568 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1569 ; VI-NEXT: v_mov_b32_e32 v1, s3
1570 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1571 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1572 ; VI-NEXT: s_lshl_b32 s2, s4, 16
1573 ; VI-NEXT: v_mov_b32_e32 v4, s2
1574 ; VI-NEXT: v_mov_b32_e32 v3, s1
1575 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1576 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1577 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1578 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1579 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1582 ; CI-LABEL: v_insertelement_v4f16_3:
1584 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1585 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1586 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1587 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1588 ; CI-NEXT: v_mov_b32_e32 v1, s3
1589 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1590 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1591 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1592 ; CI-NEXT: v_mov_b32_e32 v3, s1
1593 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1594 ; CI-NEXT: s_lshl_b32 s1, s4, 16
1595 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1596 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1597 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
1598 ; CI-NEXT: v_or_b32_e32 v1, s1, v1
1599 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; IR under test: %val is narrowed with trunc + bitcast so the inserted half is
; an arbitrary run-time bit pattern; the insert uses constant lane index 3.
1601 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1602 %tid.ext = sext i32 %tid to i64
1603 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1604 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1605 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
1606 %val.trunc = trunc i32 %val to i16
1607 %val.cvt = bitcast i16 %val.trunc to half
1608 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
1609 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
; Insert a scalar kernel argument into element 2 (a constant index) of a
; <4 x i16> vector that each work-item loads from %in and stores back to %out.
; The checks below expect only the second loaded dword (v1) to be modified:
; GFX9 replaces its low 16 bits with a single v_bfi_b32 using a 0xffff mask,
; while VI and CI mask the value with s_and_b32, clear the low half of v1 with
; v_and_b32 0xffff0000, and combine the two with v_or_b32.
1613 define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
1614 ; GFX9-LABEL: v_insertelement_v4i16_2:
1616 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1617 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1618 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
1619 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
1620 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1621 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1622 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1623 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1624 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1625 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1626 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
1627 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1628 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1629 ; GFX9-NEXT: v_bfi_b32 v1, v4, s4, v1
1630 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1631 ; GFX9-NEXT: s_endpgm
1633 ; VI-LABEL: v_insertelement_v4i16_2:
1635 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1636 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1637 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1638 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1639 ; VI-NEXT: v_mov_b32_e32 v1, s3
1640 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1641 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1642 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1643 ; VI-NEXT: v_mov_b32_e32 v3, s1
1644 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1645 ; VI-NEXT: s_and_b32 s1, s4, 0xffff
1646 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1647 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1648 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
1649 ; VI-NEXT: v_or_b32_e32 v1, s1, v1
1650 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1653 ; CI-LABEL: v_insertelement_v4i16_2:
1655 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1656 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1657 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
1658 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1659 ; CI-NEXT: v_mov_b32_e32 v1, s3
1660 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1661 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1662 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1663 ; CI-NEXT: v_mov_b32_e32 v3, s1
1664 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1665 ; CI-NEXT: s_and_b32 s1, s4, 0xffff
1666 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1667 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1668 ; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
1669 ; CI-NEXT: v_or_b32_e32 v1, s1, v1
1670 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Test input: index %in and %out by the work-item id, then overwrite lane 2.
1672 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1673 %tid.ext = sext i32 %tid to i64
1674 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
1675 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
1676 %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
1677 %val.trunc = trunc i32 %val to i16
; The i16 -> i16 bitcast is a no-op. NOTE(review): presumably kept so the IR
; mirrors the <4 x half> variants of this test - confirm before removing.
1678 %val.cvt = bitcast i16 %val.trunc to i16
1679 %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
1680 store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
1684 ; FIXME: Better code on CI?
; Insert a scalar kernel argument into a <4 x i16> vector at an index that is
; only known at run time. The index comes from a volatile load through an
; undef global pointer, and the checks below expect it in a vector register
; (v4). Expected lowering on all three targets: the index is scaled by 16
; (shift left by 4), the 64-bit mask 0xffff held in s[4:5] is shifted left by
; that amount, the value is replicated into both 16-bit halves of one SGPR
; (s_pack_ll_b32_b16 on GFX9, and/lshl/or on VI and CI), and each loaded dword
; is merged with it through v_bfi_b32 using the matching half of the mask.
1685 define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
1686 ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1688 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1689 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1690 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1691 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
1692 ; GFX9-NEXT: s_mov_b32 s5, 0
1693 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1694 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1695 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1696 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1697 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1698 ; GFX9-NEXT: s_mov_b32 s4, 0xffff
1699 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1700 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
1701 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s6, s6
1702 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1703 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1704 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v4
1705 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5]
1706 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1707 ; GFX9-NEXT: v_bfi_b32 v1, v5, s1, v1
1708 ; GFX9-NEXT: v_bfi_b32 v0, v4, s1, v0
1709 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1710 ; GFX9-NEXT: s_endpgm
1712 ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1714 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1715 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1716 ; VI-NEXT: flat_load_dword v4, v[0:1]
1717 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1718 ; VI-NEXT: s_load_dword s6, s[4:5], 0x10
1719 ; VI-NEXT: s_mov_b32 s4, 0xffff
1720 ; VI-NEXT: v_mov_b32_e32 v1, s3
1721 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1722 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1723 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1724 ; VI-NEXT: v_mov_b32_e32 v3, s1
1725 ; VI-NEXT: s_mov_b32 s5, 0
1726 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1727 ; VI-NEXT: s_and_b32 s1, s6, s4
1728 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1729 ; VI-NEXT: s_lshl_b32 s0, s1, 16
1730 ; VI-NEXT: s_or_b32 s0, s1, s0
1731 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1732 ; VI-NEXT: s_waitcnt vmcnt(1)
1733 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
1734 ; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5]
1735 ; VI-NEXT: s_waitcnt vmcnt(0)
1736 ; VI-NEXT: v_bfi_b32 v1, v5, s0, v1
1737 ; VI-NEXT: v_bfi_b32 v0, v4, s0, v0
1738 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1741 ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1743 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1744 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1745 ; CI-NEXT: flat_load_dword v4, v[0:1]
1746 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1747 ; CI-NEXT: s_load_dword s6, s[4:5], 0x4
1748 ; CI-NEXT: s_mov_b32 s4, 0xffff
1749 ; CI-NEXT: v_mov_b32_e32 v1, s3
1750 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1751 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1752 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1753 ; CI-NEXT: s_mov_b32 s5, 0
1754 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1755 ; CI-NEXT: s_lshl_b32 s2, s6, 16
1756 ; CI-NEXT: s_and_b32 s3, s6, s4
1757 ; CI-NEXT: v_mov_b32_e32 v3, s1
1758 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1759 ; CI-NEXT: s_or_b32 s1, s3, s2
1760 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1761 ; CI-NEXT: s_waitcnt vmcnt(1)
1762 ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
1763 ; CI-NEXT: v_lshl_b64 v[4:5], s[4:5], v4
1764 ; CI-NEXT: s_waitcnt vmcnt(0)
1765 ; CI-NEXT: v_bfi_b32 v1, v5, s1, v1
1766 ; CI-NEXT: v_bfi_b32 v0, v4, s1, v0
1767 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Test input: index %in and %out by the work-item id; the insert position is
; read from memory rather than passed as a kernel argument.
1769 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1770 %tid.ext = sext i32 %tid to i64
1771 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
1772 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
; Volatile load through an undef pointer: the address is irrelevant, the load
; only serves as the source of a run-time index value.
1773 %idx.val = load volatile i32, i32 addrspace(1)* undef
1774 %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
1775 %val.trunc = trunc i32 %val to i16
; The i16 -> i16 bitcast is a no-op. NOTE(review): presumably kept so the IR
; mirrors the <4 x half> variants of this test - confirm before removing.
1776 %val.cvt = bitcast i16 %val.trunc to i16
1777 %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
1778 store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
; Insert a scalar kernel argument (reinterpreted as half) into a <4 x half>
; vector at a run-time index that is itself a kernel argument (%idxval). The
; checks below expect the index and mask arithmetic to stay in scalar
; registers: the index is scaled with s_lshl_b32 by 4, the 64-bit mask 0xffff
; in s[6:7] is shifted with s_lshl_b64, the value is replicated into both
; 16-bit halves of one SGPR (s_pack_ll_b32_b16 on GFX9, and/lshl/or on VI and
; CI) and copied to v4/v5, and each loaded dword is merged through v_bfi_b32.
1782 define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
1783 ; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1785 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1786 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1787 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
1788 ; GFX9-NEXT: s_mov_b32 s7, 0
1789 ; GFX9-NEXT: s_mov_b32 s6, 0xffff
1790 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1791 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1792 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
1793 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1794 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1795 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s4
1796 ; GFX9-NEXT: s_lshl_b32 s2, s5, 4
1797 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1798 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
1799 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
1800 ; GFX9-NEXT: v_mov_b32_e32 v4, s3
1801 ; GFX9-NEXT: v_mov_b32_e32 v5, s3
1802 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1803 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1804 ; GFX9-NEXT: v_bfi_b32 v1, s1, v4, v1
1805 ; GFX9-NEXT: v_bfi_b32 v0, s0, v5, v0
1806 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1807 ; GFX9-NEXT: s_endpgm
1809 ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1811 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1812 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1813 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
1814 ; VI-NEXT: s_mov_b32 s6, 0xffff
1815 ; VI-NEXT: s_mov_b32 s7, 0
1816 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1817 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1818 ; VI-NEXT: v_mov_b32_e32 v1, s3
1819 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1820 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1821 ; VI-NEXT: s_and_b32 s2, s4, s6
1822 ; VI-NEXT: s_lshl_b32 s3, s2, 16
1823 ; VI-NEXT: s_or_b32 s2, s2, s3
1824 ; VI-NEXT: s_lshl_b32 s4, s5, 4
1825 ; VI-NEXT: v_mov_b32_e32 v3, s1
1826 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
1827 ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], s4
1828 ; VI-NEXT: v_mov_b32_e32 v4, s2
1829 ; VI-NEXT: v_mov_b32_e32 v5, s2
1830 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1831 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1832 ; VI-NEXT: v_bfi_b32 v1, s1, v4, v1
1833 ; VI-NEXT: v_bfi_b32 v0, s0, v5, v0
1834 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1837 ; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1839 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1840 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1841 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
1842 ; CI-NEXT: s_mov_b32 s6, 0xffff
1843 ; CI-NEXT: s_mov_b32 s7, 0
1844 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1845 ; CI-NEXT: v_mov_b32_e32 v1, s3
1846 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
1847 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1848 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1849 ; CI-NEXT: s_and_b32 s2, s4, s6
1850 ; CI-NEXT: s_lshl_b32 s3, s4, 16
1851 ; CI-NEXT: s_or_b32 s2, s2, s3
1852 ; CI-NEXT: s_lshl_b32 s4, s5, 4
1853 ; CI-NEXT: v_mov_b32_e32 v3, s1
1854 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
1855 ; CI-NEXT: s_lshl_b64 s[0:1], s[6:7], s4
1856 ; CI-NEXT: v_mov_b32_e32 v4, s2
1857 ; CI-NEXT: v_mov_b32_e32 v5, s2
1858 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1859 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1860 ; CI-NEXT: v_bfi_b32 v1, s1, v4, v1
1861 ; CI-NEXT: v_bfi_b32 v0, s0, v5, v0
1862 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; Test input: index %in and %out by the work-item id, then overwrite the lane
; selected by the %idxval argument.
1864 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1865 %tid.ext = sext i32 %tid to i64
1866 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1867 %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1868 %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
; The low 16 bits of %val are reinterpreted as a half, not numerically
; converted: trunc followed by bitcast keeps the bit pattern.
1869 %val.trunc = trunc i32 %val to i16
1870 %val.cvt = bitcast i16 %val.trunc to half
1871 %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
1872 store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
; Intrinsic called by the v_* kernels in this file; its result is
; sign-extended and used as the per-work-item index into %in and %out.
1876 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 is attached to the kernel definitions; #1 is attached to
; the intrinsic declaration and its call sites.
1878 attributes #0 = { nounwind }
1879 attributes #1 = { nounwind readnone }