1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
3 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
4 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s
5 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
; Scalar case: insert the constant bfloat 5.0 into element 0 of a
; <2 x bfloat> loaded through an addrspace(4) pointer, then store it to %out.
; The checks expect the upper 16 bits of the loaded dword to be preserved and
; 0x40a0 (the immediate the checks use for 5.0) to end up in the lower 16 bits.
7 define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
8 ; SI-LABEL: s_insertelement_v2bf16_0:
10 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
11 ; SI-NEXT: s_waitcnt lgkmcnt(0)
12 ; SI-NEXT: s_load_dword s4, s[2:3], 0x0
13 ; SI-NEXT: s_mov_b32 s3, 0x100f000
14 ; SI-NEXT: s_mov_b32 s2, -1
15 ; SI-NEXT: s_waitcnt lgkmcnt(0)
16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff0000
17 ; SI-NEXT: s_or_b32 s4, s4, 0x40a0
18 ; SI-NEXT: v_mov_b32_e32 v0, s4
19 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
22 ; VI-LABEL: s_insertelement_v2bf16_0:
24 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
25 ; VI-NEXT: s_waitcnt lgkmcnt(0)
26 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
27 ; VI-NEXT: v_mov_b32_e32 v0, s0
28 ; VI-NEXT: v_mov_b32_e32 v1, s1
29 ; VI-NEXT: s_waitcnt lgkmcnt(0)
30 ; VI-NEXT: s_and_b32 s0, s2, 0xffff0000
31 ; VI-NEXT: s_or_b32 s0, s0, 0x40a0
32 ; VI-NEXT: v_mov_b32_e32 v2, s0
33 ; VI-NEXT: flat_store_dword v[0:1], v2
36 ; GFX900-LABEL: s_insertelement_v2bf16_0:
38 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
39 ; GFX900-NEXT: v_mov_b32_e32 v0, 0
40 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
41 ; GFX900-NEXT: s_load_dword s2, s[2:3], 0x0
42 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
43 ; GFX900-NEXT: s_lshr_b32 s2, s2, 16
44 ; GFX900-NEXT: s_pack_ll_b32_b16 s2, 0x40a0, s2
45 ; GFX900-NEXT: v_mov_b32_e32 v1, s2
46 ; GFX900-NEXT: global_store_dword v0, v1, s[0:1]
47 ; GFX900-NEXT: s_endpgm
49 ; GFX940-LABEL: s_insertelement_v2bf16_0:
51 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
52 ; GFX940-NEXT: v_mov_b32_e32 v0, 0
53 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
54 ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x0
55 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
56 ; GFX940-NEXT: s_lshr_b32 s2, s2, 16
57 ; GFX940-NEXT: s_pack_ll_b32_b16 s2, 0x40a0, s2
58 ; GFX940-NEXT: v_mov_b32_e32 v1, s2
59 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
60 ; GFX940-NEXT: s_endpgm
61 %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr ; vector to modify
62 %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 0 ; replace element 0 with 5.0
63 store <2 x bfloat> %vecins, ptr addrspace(1) %out ; write the whole updated vector
; Scalar case: insert the constant bfloat 5.0 into element 1 of a
; <2 x bfloat> loaded through an addrspace(4) pointer, then store it to %out.
; The checks expect the lower 16 bits of the loaded dword to be preserved and
; 0x40a0 to end up in the upper 16 bits (0x40a00000 on the and/or paths).
67 define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
68 ; SI-LABEL: s_insertelement_v2bf16_1:
70 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
71 ; SI-NEXT: s_waitcnt lgkmcnt(0)
72 ; SI-NEXT: s_load_dword s4, s[2:3], 0x0
73 ; SI-NEXT: s_mov_b32 s3, 0x100f000
74 ; SI-NEXT: s_mov_b32 s2, -1
75 ; SI-NEXT: s_waitcnt lgkmcnt(0)
76 ; SI-NEXT: s_and_b32 s4, s4, 0xffff
77 ; SI-NEXT: s_or_b32 s4, s4, 0x40a00000
78 ; SI-NEXT: v_mov_b32_e32 v0, s4
79 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
82 ; VI-LABEL: s_insertelement_v2bf16_1:
84 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
85 ; VI-NEXT: s_waitcnt lgkmcnt(0)
86 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0
87 ; VI-NEXT: v_mov_b32_e32 v0, s0
88 ; VI-NEXT: v_mov_b32_e32 v1, s1
89 ; VI-NEXT: s_waitcnt lgkmcnt(0)
90 ; VI-NEXT: s_and_b32 s0, s2, 0xffff
91 ; VI-NEXT: s_or_b32 s0, s0, 0x40a00000
92 ; VI-NEXT: v_mov_b32_e32 v2, s0
93 ; VI-NEXT: flat_store_dword v[0:1], v2
96 ; GFX900-LABEL: s_insertelement_v2bf16_1:
98 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
99 ; GFX900-NEXT: v_mov_b32_e32 v0, 0
100 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
101 ; GFX900-NEXT: s_load_dword s2, s[2:3], 0x0
102 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
103 ; GFX900-NEXT: s_pack_ll_b32_b16 s2, s2, 0x40a0
104 ; GFX900-NEXT: v_mov_b32_e32 v1, s2
105 ; GFX900-NEXT: global_store_dword v0, v1, s[0:1]
106 ; GFX900-NEXT: s_endpgm
108 ; GFX940-LABEL: s_insertelement_v2bf16_1:
110 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
111 ; GFX940-NEXT: v_mov_b32_e32 v0, 0
112 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
113 ; GFX940-NEXT: s_load_dword s2, s[2:3], 0x0
114 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
115 ; GFX940-NEXT: s_pack_ll_b32_b16 s2, s2, 0x40a0
116 ; GFX940-NEXT: v_mov_b32_e32 v1, s2
117 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
118 ; GFX940-NEXT: s_endpgm
119 %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr ; vector to modify
120 %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 1 ; replace element 1 with 5.0
121 store <2 x bfloat> %vecins, ptr addrspace(1) %out ; write the whole updated vector
; Per-work-item case: each work-item loads the <2 x bfloat> at index
; workitem.id.x of %in, replaces element 0 with the constant bfloat 5.0 and
; stores the result at the same index of %out.
; The checks expect the upper 16 bits of the loaded dword to be kept and
; 0x40a0 to be merged into the lower 16 bits (and/or on the older targets,
; v_bfi_b32 with a 0xffff mask on the gfx9 targets).
125 define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
126 ; SI-LABEL: v_insertelement_v2bf16_0:
128 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
129 ; SI-NEXT: s_mov_b32 s7, 0x100f000
130 ; SI-NEXT: s_mov_b32 s6, 0
131 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
132 ; SI-NEXT: v_mov_b32_e32 v1, 0
133 ; SI-NEXT: s_waitcnt lgkmcnt(0)
134 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
135 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
136 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
137 ; SI-NEXT: s_waitcnt vmcnt(0)
138 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
139 ; SI-NEXT: v_or_b32_e32 v2, 0x40a0, v2
140 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
143 ; VI-LABEL: v_insertelement_v2bf16_0:
145 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
146 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
147 ; VI-NEXT: s_waitcnt lgkmcnt(0)
148 ; VI-NEXT: v_mov_b32_e32 v1, s3
149 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
150 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
151 ; VI-NEXT: flat_load_dword v3, v[0:1]
152 ; VI-NEXT: v_mov_b32_e32 v1, s1
153 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
154 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
155 ; VI-NEXT: s_waitcnt vmcnt(0)
156 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
157 ; VI-NEXT: v_or_b32_e32 v2, 0x40a0, v2
158 ; VI-NEXT: flat_store_dword v[0:1], v2
161 ; GFX900-LABEL: v_insertelement_v2bf16_0:
163 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
164 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0
165 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x40a0
166 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
167 ; GFX900-NEXT: global_load_dword v1, v0, s[2:3]
168 ; GFX900-NEXT: s_mov_b32 s2, 0xffff
169 ; GFX900-NEXT: s_waitcnt vmcnt(0)
170 ; GFX900-NEXT: v_bfi_b32 v1, s2, v2, v1
171 ; GFX900-NEXT: global_store_dword v0, v1, s[0:1]
172 ; GFX900-NEXT: s_endpgm
174 ; GFX940-LABEL: v_insertelement_v2bf16_0:
176 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
177 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
178 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
179 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x40a0
180 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
181 ; GFX940-NEXT: global_load_dword v1, v0, s[2:3]
182 ; GFX940-NEXT: s_mov_b32 s2, 0xffff
183 ; GFX940-NEXT: s_waitcnt vmcnt(0)
184 ; GFX940-NEXT: v_bfi_b32 v1, s2, v2, v1
185 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
186 ; GFX940-NEXT: s_endpgm
187 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 ; per-work-item index
188 %tid.ext = sext i32 %tid to i64 ; widen the index for the i64 GEPs below
189 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext ; &in[tid]
190 %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext ; &out[tid]
191 %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep ; vector to modify
192 %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 0 ; replace element 0 with 5.0
193 store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep ; write the whole updated vector
; Per-work-item case with a small constant: each work-item loads the
; <2 x bfloat> at index workitem.id.x of %in, replaces element 0 with the
; bfloat whose bit pattern is 0x0035 and stores the result to %out.
; In the checks the value appears as the literal operand 53 directly in the
; merging instruction, with no separate move of the constant into a register.
197 define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
198 ; SI-LABEL: v_insertelement_v2bf16_0_inlineimm:
200 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
201 ; SI-NEXT: s_mov_b32 s7, 0x100f000
202 ; SI-NEXT: s_mov_b32 s6, 0
203 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
204 ; SI-NEXT: v_mov_b32_e32 v1, 0
205 ; SI-NEXT: s_waitcnt lgkmcnt(0)
206 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
207 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
208 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
209 ; SI-NEXT: s_waitcnt vmcnt(0)
210 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
211 ; SI-NEXT: v_or_b32_e32 v2, 53, v2
212 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
215 ; VI-LABEL: v_insertelement_v2bf16_0_inlineimm:
217 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
218 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
219 ; VI-NEXT: s_waitcnt lgkmcnt(0)
220 ; VI-NEXT: v_mov_b32_e32 v1, s3
221 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
222 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
223 ; VI-NEXT: flat_load_dword v3, v[0:1]
224 ; VI-NEXT: v_mov_b32_e32 v1, s1
225 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
226 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
227 ; VI-NEXT: s_waitcnt vmcnt(0)
228 ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
229 ; VI-NEXT: v_or_b32_e32 v2, 53, v2
230 ; VI-NEXT: flat_store_dword v[0:1], v2
233 ; GFX900-LABEL: v_insertelement_v2bf16_0_inlineimm:
235 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
236 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0
237 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
238 ; GFX900-NEXT: global_load_dword v1, v0, s[2:3]
239 ; GFX900-NEXT: s_mov_b32 s2, 0xffff
240 ; GFX900-NEXT: s_waitcnt vmcnt(0)
241 ; GFX900-NEXT: v_bfi_b32 v1, s2, 53, v1
242 ; GFX900-NEXT: global_store_dword v0, v1, s[0:1]
243 ; GFX900-NEXT: s_endpgm
245 ; GFX940-LABEL: v_insertelement_v2bf16_0_inlineimm:
247 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
248 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
249 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
250 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
251 ; GFX940-NEXT: global_load_dword v1, v0, s[2:3]
252 ; GFX940-NEXT: s_mov_b32 s2, 0xffff
253 ; GFX940-NEXT: s_waitcnt vmcnt(0)
254 ; GFX940-NEXT: v_bfi_b32 v1, s2, 53, v1
255 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
256 ; GFX940-NEXT: s_endpgm
257 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 ; per-work-item index
258 %tid.ext = sext i32 %tid to i64 ; widen the index for the i64 GEPs below
259 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext ; &in[tid]
260 %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext ; &out[tid]
261 %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep ; vector to modify
262 %vecins = insertelement <2 x bfloat> %vec, bfloat 0xR0035, i32 0 ; 0xR0035 is the raw bit pattern 0x0035 (53)
263 store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep ; write the whole updated vector
; Per-work-item case: each work-item loads the <2 x bfloat> at index
; workitem.id.x of %in, replaces element 1 with the constant bfloat 5.0 and
; stores the result at the same index of %out.
; The checks expect the lower 16 bits of the loaded dword to be kept and
; 0x40a0 to be placed in the upper 16 bits (and/or with 0x40a00000, an SDWA
; or, or v_perm_b32 with selector 0x5040100, depending on the target).
267 define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
268 ; SI-LABEL: v_insertelement_v2bf16_1:
270 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
271 ; SI-NEXT: s_mov_b32 s7, 0x100f000
272 ; SI-NEXT: s_mov_b32 s6, 0
273 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
274 ; SI-NEXT: v_mov_b32_e32 v1, 0
275 ; SI-NEXT: s_waitcnt lgkmcnt(0)
276 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
277 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
278 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
279 ; SI-NEXT: s_waitcnt vmcnt(0)
280 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
281 ; SI-NEXT: v_or_b32_e32 v2, 0x40a00000, v2
282 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
285 ; VI-LABEL: v_insertelement_v2bf16_1:
287 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
288 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
289 ; VI-NEXT: s_waitcnt lgkmcnt(0)
290 ; VI-NEXT: v_mov_b32_e32 v1, s3
291 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
292 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
293 ; VI-NEXT: flat_load_dword v3, v[0:1]
294 ; VI-NEXT: v_mov_b32_e32 v1, s1
295 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
296 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000
297 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
298 ; VI-NEXT: s_waitcnt vmcnt(0)
299 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
300 ; VI-NEXT: flat_store_dword v[0:1], v2
303 ; GFX900-LABEL: v_insertelement_v2bf16_1:
305 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
306 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0
307 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x5040100
308 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
309 ; GFX900-NEXT: global_load_dword v1, v0, s[2:3]
310 ; GFX900-NEXT: s_movk_i32 s2, 0x40a0
311 ; GFX900-NEXT: s_waitcnt vmcnt(0)
312 ; GFX900-NEXT: v_perm_b32 v1, s2, v1, v2
313 ; GFX900-NEXT: global_store_dword v0, v1, s[0:1]
314 ; GFX900-NEXT: s_endpgm
316 ; GFX940-LABEL: v_insertelement_v2bf16_1:
318 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
319 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
320 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
321 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x5040100
322 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
323 ; GFX940-NEXT: global_load_dword v1, v0, s[2:3]
324 ; GFX940-NEXT: s_movk_i32 s2, 0x40a0
325 ; GFX940-NEXT: s_waitcnt vmcnt(0)
326 ; GFX940-NEXT: v_perm_b32 v1, s2, v1, v2
327 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
328 ; GFX940-NEXT: s_endpgm
329 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 ; per-work-item index
330 %tid.ext = sext i32 %tid to i64 ; widen the index for the i64 GEPs below
331 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext ; &in[tid]
332 %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext ; &out[tid]
333 %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep ; vector to modify
334 %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 1 ; replace element 1 with 5.0
335 store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep ; write the whole updated vector
; Per-work-item case with a small constant: each work-item loads the
; <2 x bfloat> at index workitem.id.x of %in, replaces element 1 with the
; bfloat whose bit pattern is 0x0023 and stores the result to %out.
; The tahiti and tonga checks use the pre-shifted literal 0x230000; the gfx9
; checks pass 35 directly as the first operand of v_perm_b32.
339 define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
340 ; SI-LABEL: v_insertelement_v2bf16_1_inlineimm:
342 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
343 ; SI-NEXT: s_mov_b32 s7, 0x100f000
344 ; SI-NEXT: s_mov_b32 s6, 0
345 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
346 ; SI-NEXT: v_mov_b32_e32 v1, 0
347 ; SI-NEXT: s_waitcnt lgkmcnt(0)
348 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
349 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
350 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
351 ; SI-NEXT: s_waitcnt vmcnt(0)
352 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
353 ; SI-NEXT: v_or_b32_e32 v2, 0x230000, v2
354 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
357 ; VI-LABEL: v_insertelement_v2bf16_1_inlineimm:
359 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
360 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
361 ; VI-NEXT: s_waitcnt lgkmcnt(0)
362 ; VI-NEXT: v_mov_b32_e32 v1, s3
363 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
364 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
365 ; VI-NEXT: flat_load_dword v3, v[0:1]
366 ; VI-NEXT: v_mov_b32_e32 v1, s1
367 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
368 ; VI-NEXT: v_mov_b32_e32 v2, 0x230000
369 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
370 ; VI-NEXT: s_waitcnt vmcnt(0)
371 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
372 ; VI-NEXT: flat_store_dword v[0:1], v2
375 ; GFX900-LABEL: v_insertelement_v2bf16_1_inlineimm:
377 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
378 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0
379 ; GFX900-NEXT: v_mov_b32_e32 v2, 0x5040100
380 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
381 ; GFX900-NEXT: global_load_dword v1, v0, s[2:3]
382 ; GFX900-NEXT: s_waitcnt vmcnt(0)
383 ; GFX900-NEXT: v_perm_b32 v1, 35, v1, v2
384 ; GFX900-NEXT: global_store_dword v0, v1, s[0:1]
385 ; GFX900-NEXT: s_endpgm
387 ; GFX940-LABEL: v_insertelement_v2bf16_1_inlineimm:
389 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
390 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
391 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
392 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x5040100
393 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
394 ; GFX940-NEXT: global_load_dword v1, v0, s[2:3]
395 ; GFX940-NEXT: s_waitcnt vmcnt(0)
396 ; GFX940-NEXT: v_perm_b32 v1, 35, v1, v2
397 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
398 ; GFX940-NEXT: s_endpgm
399 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 ; per-work-item index
400 %tid.ext = sext i32 %tid to i64 ; widen the index for the i64 GEPs below
401 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext ; &in[tid]
402 %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext ; &out[tid]
403 %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep ; vector to modify
404 %vecins = insertelement <2 x bfloat> %vec, bfloat 0xR0023, i32 1 ; 0xR0023 is the raw bit pattern 0x0023 (35)
405 store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep ; write the whole updated vector
; Dynamic-index case: each work-item loads both a <2 x bfloat> from %in and
; an i32 element index from %idx.ptr (both at index workitem.id.x), inserts
; the bfloat with bit pattern 0x1234 at that index and stores the result.
; The checks expect a branch-free lowering: the index is shifted left by 4,
; a 0xffff mask is shifted by that amount, and v_bfi_b32 uses the mask to
; select between the splatted constant 0x12341234 and the loaded dword.
409 define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %idx.ptr) #0 {
410 ; SI-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
412 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
413 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
414 ; SI-NEXT: s_mov_b32 s11, 0x100f000
415 ; SI-NEXT: s_mov_b32 s10, 0
416 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
417 ; SI-NEXT: v_mov_b32_e32 v1, 0
418 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
419 ; SI-NEXT: s_waitcnt lgkmcnt(0)
420 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
421 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
422 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
423 ; SI-NEXT: s_mov_b32 s4, 0x12341234
424 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
425 ; SI-NEXT: s_waitcnt vmcnt(1)
426 ; SI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
427 ; SI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
428 ; SI-NEXT: s_waitcnt vmcnt(0)
429 ; SI-NEXT: v_bfi_b32 v2, v2, s4, v3
430 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
433 ; VI-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
435 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
436 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
437 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
438 ; VI-NEXT: s_waitcnt lgkmcnt(0)
439 ; VI-NEXT: v_mov_b32_e32 v3, s3
440 ; VI-NEXT: v_mov_b32_e32 v1, s5
441 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
442 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
443 ; VI-NEXT: flat_load_dword v4, v[0:1]
444 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
445 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
446 ; VI-NEXT: flat_load_dword v3, v[0:1]
447 ; VI-NEXT: s_mov_b32 s2, 0xffff
448 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
449 ; VI-NEXT: v_mov_b32_e32 v1, s1
450 ; VI-NEXT: s_mov_b32 s0, 0x12341234
451 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
452 ; VI-NEXT: s_waitcnt vmcnt(1)
453 ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4
454 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2
455 ; VI-NEXT: s_waitcnt vmcnt(0)
456 ; VI-NEXT: v_bfi_b32 v2, v2, s0, v3
457 ; VI-NEXT: flat_store_dword v[0:1], v2
460 ; GFX900-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
462 ; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
463 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
464 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 2, v0
465 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
466 ; GFX900-NEXT: global_load_dword v1, v0, s[4:5]
467 ; GFX900-NEXT: global_load_dword v2, v0, s[2:3]
468 ; GFX900-NEXT: s_mov_b32 s2, 0xffff
469 ; GFX900-NEXT: s_waitcnt vmcnt(1)
470 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 4, v1
471 ; GFX900-NEXT: v_lshlrev_b32_e64 v1, v1, s2
472 ; GFX900-NEXT: s_mov_b32 s2, 0x12341234
473 ; GFX900-NEXT: s_waitcnt vmcnt(0)
474 ; GFX900-NEXT: v_bfi_b32 v1, v1, s2, v2
475 ; GFX900-NEXT: global_store_dword v0, v1, s[0:1]
476 ; GFX900-NEXT: s_endpgm
478 ; GFX940-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
480 ; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
481 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
482 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
483 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
484 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
485 ; GFX940-NEXT: global_load_dword v1, v0, s[6:7]
486 ; GFX940-NEXT: global_load_dword v2, v0, s[2:3]
487 ; GFX940-NEXT: s_mov_b32 s2, 0xffff
488 ; GFX940-NEXT: s_waitcnt vmcnt(1)
489 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 4, v1
490 ; GFX940-NEXT: v_lshlrev_b32_e64 v1, v1, s2
491 ; GFX940-NEXT: s_mov_b32 s2, 0x12341234
492 ; GFX940-NEXT: s_waitcnt vmcnt(0)
493 ; GFX940-NEXT: v_bfi_b32 v1, v1, s2, v2
494 ; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
495 ; GFX940-NEXT: s_endpgm
496 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 ; per-work-item index
497 %tid.ext = sext i32 %tid to i64 ; widen the index for the i64 GEPs below
498 %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext ; &in[tid]
499 %idx.gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext ; &idx.ptr[tid]
500 %out.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext ; &out[tid]
501 %idx = load i32, ptr addrspace(1) %idx.gep ; element index, loaded from memory per work-item
502 %vec = load <2 x bfloat>, ptr addrspace(1) %in.gep ; vector to modify
503 %vecins = insertelement <2 x bfloat> %vec, bfloat 0xR1234, i32 %idx ; insert bit pattern 0x1234 at the dynamic index
504 store <2 x bfloat> %vecins, ptr addrspace(1) %out.gep ; write the whole updated vector
; Four-element case, element 0: each work-item loads the <4 x bfloat> at
; index workitem.id.x of %in, replaces element 0 with the low 16 bits of the
; kernel argument %val (reinterpreted as bfloat) and stores the result.
; The unnamed [8 x i32] argument sits between %in and %val; the checks load
; %val from offset 0x30 (0xc on tahiti) rather than the 0x10 (0x4) seen in
; the variants of this test without that argument.
; Only the first dword of the loaded pair is modified in the checks.
508 define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
509 ; SI-LABEL: v_insertelement_v4bf16_0:
511 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
512 ; SI-NEXT: s_mov_b32 s7, 0x100f000
513 ; SI-NEXT: s_mov_b32 s6, 0
514 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
515 ; SI-NEXT: v_mov_b32_e32 v1, 0
516 ; SI-NEXT: s_waitcnt lgkmcnt(0)
517 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
518 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
519 ; SI-NEXT: s_load_dword s8, s[8:9], 0xc
520 ; SI-NEXT: s_mov_b32 s4, 0xffff
521 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
522 ; SI-NEXT: s_waitcnt lgkmcnt(0)
523 ; SI-NEXT: v_mov_b32_e32 v4, s8
524 ; SI-NEXT: s_waitcnt vmcnt(0)
525 ; SI-NEXT: v_bfi_b32 v2, s4, v4, v2
526 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
529 ; VI-LABEL: v_insertelement_v4bf16_0:
531 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
532 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30
533 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
534 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
535 ; VI-NEXT: s_waitcnt lgkmcnt(0)
536 ; VI-NEXT: v_mov_b32_e32 v1, s3
537 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
538 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
539 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
540 ; VI-NEXT: v_mov_b32_e32 v3, s1
541 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
542 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
543 ; VI-NEXT: s_waitcnt vmcnt(0)
544 ; VI-NEXT: v_perm_b32 v0, s4, v0, v4
545 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
548 ; GFX900-LABEL: v_insertelement_v4bf16_0:
550 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
551 ; GFX900-NEXT: s_load_dword s4, s[8:9], 0x30
552 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
553 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
554 ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
555 ; GFX900-NEXT: s_mov_b32 s2, 0xffff
556 ; GFX900-NEXT: v_mov_b32_e32 v3, s4
557 ; GFX900-NEXT: s_waitcnt vmcnt(0)
558 ; GFX900-NEXT: v_bfi_b32 v0, s2, v3, v0
559 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
560 ; GFX900-NEXT: s_endpgm
562 ; GFX940-LABEL: v_insertelement_v4bf16_0:
564 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
565 ; GFX940-NEXT: s_load_dword s6, s[4:5], 0x30
566 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
567 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0
568 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
569 ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
570 ; GFX940-NEXT: s_mov_b32 s2, 0xffff
571 ; GFX940-NEXT: v_mov_b32_e32 v3, s6
572 ; GFX940-NEXT: s_waitcnt vmcnt(0)
573 ; GFX940-NEXT: v_bfi_b32 v0, s2, v3, v0
574 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
575 ; GFX940-NEXT: s_endpgm
576 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 ; per-work-item index
577 %tid.ext = sext i32 %tid to i64 ; widen the index for the i64 GEPs below
578 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext ; &in[tid]
579 %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext ; &out[tid]
580 %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep ; vector to modify
581 %val.trunc = trunc i32 %val to i16 ; keep only the low 16 bits of %val
582 %val.cvt = bitcast i16 %val.trunc to bfloat ; reinterpret those bits as a bfloat (no conversion)
583 %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 0 ; replace element 0
584 store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep ; write the whole updated vector
; Four-element case, element 1: each work-item loads the <4 x bfloat> at
; index workitem.id.x of %in, replaces element 1 with the low 16 bits of the
; kernel argument %val (reinterpreted as bfloat) and stores the result.
; %val directly follows %in here; the checks load it from offset 0x10
; (0x4 on tahiti). Only the first dword of the loaded pair is modified, with
; the new value going into its upper 16 bits.
588 define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
589 ; SI-LABEL: v_insertelement_v4bf16_1:
591 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
592 ; SI-NEXT: s_mov_b32 s7, 0x100f000
593 ; SI-NEXT: s_mov_b32 s6, 0
594 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
595 ; SI-NEXT: v_mov_b32_e32 v1, 0
596 ; SI-NEXT: s_waitcnt lgkmcnt(0)
597 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
598 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
599 ; SI-NEXT: s_load_dword s8, s[8:9], 0x4
600 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
601 ; SI-NEXT: s_waitcnt lgkmcnt(0)
602 ; SI-NEXT: s_lshl_b32 s4, s8, 16
603 ; SI-NEXT: s_waitcnt vmcnt(0)
604 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
605 ; SI-NEXT: v_or_b32_e32 v2, s4, v2
606 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
609 ; VI-LABEL: v_insertelement_v4bf16_1:
611 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
612 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
613 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
614 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
615 ; VI-NEXT: s_waitcnt lgkmcnt(0)
616 ; VI-NEXT: v_mov_b32_e32 v1, s3
617 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
618 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
619 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
620 ; VI-NEXT: v_mov_b32_e32 v3, s1
621 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
622 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
623 ; VI-NEXT: s_waitcnt vmcnt(0)
624 ; VI-NEXT: v_perm_b32 v0, v0, s4, v4
625 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
628 ; GFX900-LABEL: v_insertelement_v4bf16_1:
630 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
631 ; GFX900-NEXT: s_load_dword s4, s[8:9], 0x10
632 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
633 ; GFX900-NEXT: v_mov_b32_e32 v3, 0x5040100
634 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
635 ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
636 ; GFX900-NEXT: s_waitcnt vmcnt(0)
637 ; GFX900-NEXT: v_perm_b32 v0, s4, v0, v3
638 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
639 ; GFX900-NEXT: s_endpgm
641 ; GFX940-LABEL: v_insertelement_v4bf16_1:
643 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
644 ; GFX940-NEXT: s_load_dword s6, s[4:5], 0x10
645 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
646 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0
647 ; GFX940-NEXT: v_mov_b32_e32 v3, 0x5040100
648 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
649 ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
650 ; GFX940-NEXT: s_waitcnt vmcnt(0)
651 ; GFX940-NEXT: v_perm_b32 v0, s6, v0, v3
652 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
653 ; GFX940-NEXT: s_endpgm
654 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 ; per-work-item index
655 %tid.ext = sext i32 %tid to i64 ; widen the index for the i64 GEPs below
656 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext ; &in[tid]
657 %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext ; &out[tid]
658 %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep ; vector to modify
659 %val.trunc = trunc i32 %val to i16 ; keep only the low 16 bits of %val
660 %val.cvt = bitcast i16 %val.trunc to bfloat ; reinterpret those bits as a bfloat (no conversion)
661 %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 1 ; replace element 1
662 store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep ; write the whole updated vector
; Four-element case, element 2: each work-item loads the <4 x bfloat> at
; index workitem.id.x of %in, replaces element 2 with the low 16 bits of the
; kernel argument %val (reinterpreted as bfloat) and stores the result.
; The unnamed [8 x i32] argument sits between %in and %val; the checks load
; %val from offset 0x30 (0xc on tahiti).
; Only the second dword of the loaded pair is modified in the checks, with
; the new value going into its lower 16 bits.
666 define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr addrspace(1) %in, [8 x i32], i32 %val) #0 {
667 ; SI-LABEL: v_insertelement_v4bf16_2:
669 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
670 ; SI-NEXT: s_mov_b32 s7, 0x100f000
671 ; SI-NEXT: s_mov_b32 s6, 0
672 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
673 ; SI-NEXT: v_mov_b32_e32 v1, 0
674 ; SI-NEXT: s_waitcnt lgkmcnt(0)
675 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
676 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
677 ; SI-NEXT: s_load_dword s8, s[8:9], 0xc
678 ; SI-NEXT: s_mov_b32 s4, 0xffff
679 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
680 ; SI-NEXT: s_waitcnt lgkmcnt(0)
681 ; SI-NEXT: v_mov_b32_e32 v4, s8
682 ; SI-NEXT: s_waitcnt vmcnt(0)
683 ; SI-NEXT: v_bfi_b32 v3, s4, v4, v3
684 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
687 ; VI-LABEL: v_insertelement_v4bf16_2:
689 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
690 ; VI-NEXT: s_load_dword s4, s[8:9], 0x30
691 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
692 ; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
693 ; VI-NEXT: s_waitcnt lgkmcnt(0)
694 ; VI-NEXT: v_mov_b32_e32 v1, s3
695 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
696 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
697 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
698 ; VI-NEXT: v_mov_b32_e32 v3, s1
699 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
700 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
701 ; VI-NEXT: s_waitcnt vmcnt(0)
702 ; VI-NEXT: v_perm_b32 v1, s4, v1, v4
703 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
706 ; GFX900-LABEL: v_insertelement_v4bf16_2:
708 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
709 ; GFX900-NEXT: s_load_dword s4, s[8:9], 0x30
710 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
711 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
712 ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
713 ; GFX900-NEXT: s_mov_b32 s2, 0xffff
714 ; GFX900-NEXT: v_mov_b32_e32 v3, s4
715 ; GFX900-NEXT: s_waitcnt vmcnt(0)
716 ; GFX900-NEXT: v_bfi_b32 v1, s2, v3, v1
717 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
718 ; GFX900-NEXT: s_endpgm
720 ; GFX940-LABEL: v_insertelement_v4bf16_2:
722 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
723 ; GFX940-NEXT: s_load_dword s6, s[4:5], 0x30
724 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
725 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0
726 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
727 ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
728 ; GFX940-NEXT: s_mov_b32 s2, 0xffff
729 ; GFX940-NEXT: v_mov_b32_e32 v3, s6
730 ; GFX940-NEXT: s_waitcnt vmcnt(0)
731 ; GFX940-NEXT: v_bfi_b32 v1, s2, v3, v1
732 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
733 ; GFX940-NEXT: s_endpgm
734 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 ; per-work-item index
735 %tid.ext = sext i32 %tid to i64 ; widen the index for the i64 GEPs below
736 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext ; &in[tid]
737 %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext ; &out[tid]
738 %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep ; vector to modify
739 %val.trunc = trunc i32 %val to i16 ; keep only the low 16 bits of %val
740 %val.cvt = bitcast i16 %val.trunc to bfloat ; reinterpret those bits as a bfloat (no conversion)
741 %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 2 ; replace element 2
742 store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep ; write the whole updated vector
; Four-element case, element 3: each work-item loads the <4 x bfloat> at
; index workitem.id.x of %in, replaces element 3 with the low 16 bits of the
; kernel argument %val (reinterpreted as bfloat) and stores the result.
; %val directly follows %in here; the checks load it from offset 0x10
; (0x4 on tahiti). Only the second dword of the loaded pair is modified, with
; the new value going into its upper 16 bits.
746 define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) #0 {
747 ; SI-LABEL: v_insertelement_v4bf16_3:
749 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
750 ; SI-NEXT: s_mov_b32 s7, 0x100f000
751 ; SI-NEXT: s_mov_b32 s6, 0
752 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
753 ; SI-NEXT: v_mov_b32_e32 v1, 0
754 ; SI-NEXT: s_waitcnt lgkmcnt(0)
755 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
756 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
757 ; SI-NEXT: s_load_dword s8, s[8:9], 0x4
758 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
759 ; SI-NEXT: s_waitcnt lgkmcnt(0)
760 ; SI-NEXT: s_lshl_b32 s4, s8, 16
761 ; SI-NEXT: s_waitcnt vmcnt(0)
762 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
763 ; SI-NEXT: v_or_b32_e32 v3, s4, v3
764 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
767 ; VI-LABEL: v_insertelement_v4bf16_3:
769 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
770 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
771 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
772 ; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
773 ; VI-NEXT: s_waitcnt lgkmcnt(0)
774 ; VI-NEXT: v_mov_b32_e32 v1, s3
775 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
776 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
777 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
778 ; VI-NEXT: v_mov_b32_e32 v3, s1
779 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
780 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
781 ; VI-NEXT: s_waitcnt vmcnt(0)
782 ; VI-NEXT: v_perm_b32 v1, v1, s4, v4
783 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
786 ; GFX900-LABEL: v_insertelement_v4bf16_3:
788 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
789 ; GFX900-NEXT: s_load_dword s4, s[8:9], 0x10
790 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
791 ; GFX900-NEXT: v_mov_b32_e32 v3, 0x5040100
792 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
793 ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
794 ; GFX900-NEXT: s_waitcnt vmcnt(0)
795 ; GFX900-NEXT: v_perm_b32 v1, s4, v1, v3
796 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
797 ; GFX900-NEXT: s_endpgm
799 ; GFX940-LABEL: v_insertelement_v4bf16_3:
801 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
802 ; GFX940-NEXT: s_load_dword s6, s[4:5], 0x10
803 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
804 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0
805 ; GFX940-NEXT: v_mov_b32_e32 v3, 0x5040100
806 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
807 ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
808 ; GFX940-NEXT: s_waitcnt vmcnt(0)
809 ; GFX940-NEXT: v_perm_b32 v1, s6, v1, v3
810 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
811 ; GFX940-NEXT: s_endpgm
812 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 ; per-work-item index
813 %tid.ext = sext i32 %tid to i64 ; widen the index for the i64 GEPs below
814 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext ; &in[tid]
815 %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext ; &out[tid]
816 %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep ; vector to modify
817 %val.trunc = trunc i32 %val to i16 ; keep only the low 16 bits of %val
818 %val.cvt = bitcast i16 %val.trunc to bfloat ; reinterpret those bits as a bfloat (no conversion)
819 %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 3 ; replace element 3
820 store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep ; write the whole updated vector
; Test: insertelement into <4 x bfloat> at a runtime index taken from the
; kernel argument %idxval. Each workitem loads the vector at %in[tid], replaces
; element %idxval with the low 16 bits of %val reinterpreted as bfloat, and
; stores the whole vector to %out[tid].
;
; Expected code shape (all four targets): the index is scaled by 16 (shift left
; by 4), a 0xffff mask is shifted by that amount as a 64-bit value, and one
; v_bfi_b32 per dword merges in the value, which has been replicated into both
; 16-bit halves of a dword (and/shl/or on tahiti and tonga, s_pack_ll_b32_b16
; on gfx900 and gfx940).
824 define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %idxval) #0 {
825 ; SI-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
827 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
828 ; SI-NEXT: s_mov_b32 s7, 0x100f000
829 ; SI-NEXT: s_mov_b32 s6, 0
830 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
831 ; SI-NEXT: v_mov_b32_e32 v1, 0
832 ; SI-NEXT: s_waitcnt lgkmcnt(0)
833 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
834 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
835 ; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x4
836 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
837 ; SI-NEXT: s_waitcnt lgkmcnt(0)
838 ; SI-NEXT: s_lshl_b32 s4, s8, 16
839 ; SI-NEXT: s_and_b32 s5, s8, 0xffff
840 ; SI-NEXT: s_lshl_b32 s6, s9, 4
841 ; SI-NEXT: s_or_b32 s7, s5, s4
842 ; SI-NEXT: s_lshl_b64 s[4:5], 0xffff, s6
843 ; SI-NEXT: v_mov_b32_e32 v4, s7
844 ; SI-NEXT: v_mov_b32_e32 v5, s7
845 ; SI-NEXT: s_waitcnt vmcnt(0)
846 ; SI-NEXT: v_bfi_b32 v3, s5, v4, v3
847 ; SI-NEXT: v_bfi_b32 v2, s4, v5, v2
848 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
851 ; VI-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
853 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
854 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
855 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
856 ; VI-NEXT: s_waitcnt lgkmcnt(0)
857 ; VI-NEXT: v_mov_b32_e32 v1, s3
858 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
859 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
860 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
861 ; VI-NEXT: v_mov_b32_e32 v3, s1
862 ; VI-NEXT: s_lshl_b32 s1, s4, 16
863 ; VI-NEXT: s_and_b32 s2, s4, 0xffff
864 ; VI-NEXT: s_lshl_b32 s3, s5, 4
865 ; VI-NEXT: s_or_b32 s2, s2, s1
866 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
867 ; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s3
868 ; VI-NEXT: v_mov_b32_e32 v4, s2
869 ; VI-NEXT: v_mov_b32_e32 v5, s2
870 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
871 ; VI-NEXT: s_waitcnt vmcnt(0)
872 ; VI-NEXT: v_bfi_b32 v1, s1, v4, v1
873 ; VI-NEXT: v_bfi_b32 v0, s0, v5, v0
874 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
877 ; GFX900-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
879 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
880 ; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
881 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 3, v0
882 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
883 ; GFX900-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
884 ; GFX900-NEXT: s_lshl_b32 s2, s5, 4
885 ; GFX900-NEXT: s_pack_ll_b32_b16 s4, s4, s4
886 ; GFX900-NEXT: s_lshl_b64 s[2:3], 0xffff, s2
887 ; GFX900-NEXT: v_mov_b32_e32 v3, s4
888 ; GFX900-NEXT: v_mov_b32_e32 v4, s4
889 ; GFX900-NEXT: s_waitcnt vmcnt(0)
890 ; GFX900-NEXT: v_bfi_b32 v1, s3, v3, v1
891 ; GFX900-NEXT: v_bfi_b32 v0, s2, v4, v0
892 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
893 ; GFX900-NEXT: s_endpgm
895 ; GFX940-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
897 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
898 ; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
899 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
900 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 3, v0
901 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
902 ; GFX940-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
903 ; GFX940-NEXT: s_lshl_b32 s2, s7, 4
904 ; GFX940-NEXT: s_pack_ll_b32_b16 s4, s6, s6
905 ; GFX940-NEXT: s_lshl_b64 s[2:3], 0xffff, s2
906 ; GFX940-NEXT: v_mov_b32_e32 v3, s4
907 ; GFX940-NEXT: v_mov_b32_e32 v4, s4
908 ; GFX940-NEXT: s_waitcnt vmcnt(0)
909 ; GFX940-NEXT: v_bfi_b32 v1, s3, v3, v1
910 ; GFX940-NEXT: v_bfi_b32 v0, s2, v4, v0
911 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
912 ; GFX940-NEXT: s_endpgm
; Index both buffers by the sign-extended workitem id, one <4 x bfloat> per workitem.
913 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
914 %tid.ext = sext i32 %tid to i64
915 %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
916 %out.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
917 %vec = load <4 x bfloat>, ptr addrspace(1) %in.gep
; Only the low 16 bits of %val are used; the bitcast reinterprets them as bfloat
; without any numeric conversion.
918 %val.trunc = trunc i32 %val to i16
919 %val.cvt = bitcast i16 %val.trunc to bfloat
; The insertion index is the runtime kernel argument, not a constant.
920 %vecins = insertelement <4 x bfloat> %vec, bfloat %val.cvt, i32 %idxval
921 store <4 x bfloat> %vecins, ptr addrspace(1) %out.gep
; Test: insertelement into <8 x bfloat> at the constant index 3. Each workitem
; loads the vector at %in[tid], replaces element 3 with the low 16 bits of %val
; reinterpreted as bfloat, and stores the whole vector to %out[tid].
;
; Expected code shape: the vector is loaded as four dwords and only the second
; dword (v1) is modified. tahiti masks v1 to its low 16 bits and ORs in
; %val << 16; tonga does the same with a single SDWA v_or_b32; gfx900 and
; gfx940 use one v_perm_b32 with selector 0x5040100.
925 define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
926 ; SI-LABEL: v_insertelement_v8bf16_3:
928 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
929 ; SI-NEXT: s_mov_b32 s7, 0x100f000
930 ; SI-NEXT: s_mov_b32 s6, 0
931 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
932 ; SI-NEXT: v_mov_b32_e32 v5, 0
933 ; SI-NEXT: s_waitcnt lgkmcnt(0)
934 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
935 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
936 ; SI-NEXT: s_load_dword s8, s[8:9], 0x4
937 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
938 ; SI-NEXT: s_waitcnt lgkmcnt(0)
939 ; SI-NEXT: s_lshl_b32 s4, s8, 16
940 ; SI-NEXT: s_waitcnt vmcnt(0)
941 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
942 ; SI-NEXT: v_or_b32_e32 v1, s4, v1
943 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
946 ; VI-LABEL: v_insertelement_v8bf16_3:
948 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
949 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
950 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
951 ; VI-NEXT: s_waitcnt lgkmcnt(0)
952 ; VI-NEXT: v_mov_b32_e32 v1, s3
953 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
954 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
955 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
956 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
957 ; VI-NEXT: s_lshl_b32 s0, s4, 16
958 ; VI-NEXT: v_mov_b32_e32 v5, s1
959 ; VI-NEXT: v_mov_b32_e32 v6, s0
960 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
961 ; VI-NEXT: s_waitcnt vmcnt(0)
962 ; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
963 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
966 ; GFX900-LABEL: v_insertelement_v8bf16_3:
968 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
969 ; GFX900-NEXT: s_load_dword s4, s[8:9], 0x10
970 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
971 ; GFX900-NEXT: v_mov_b32_e32 v5, 0x5040100
972 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
973 ; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
974 ; GFX900-NEXT: s_waitcnt vmcnt(0)
975 ; GFX900-NEXT: v_perm_b32 v1, s4, v1, v5
976 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
977 ; GFX900-NEXT: s_endpgm
979 ; GFX940-LABEL: v_insertelement_v8bf16_3:
981 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
982 ; GFX940-NEXT: s_load_dword s6, s[4:5], 0x10
983 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
984 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 4, v0
985 ; GFX940-NEXT: v_mov_b32_e32 v5, 0x5040100
986 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
987 ; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
988 ; GFX940-NEXT: s_waitcnt vmcnt(0)
989 ; GFX940-NEXT: v_perm_b32 v1, s6, v1, v5
990 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
991 ; GFX940-NEXT: s_endpgm
; Index both buffers by the sign-extended workitem id, one <8 x bfloat> per workitem.
992 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
993 %tid.ext = sext i32 %tid to i64
994 %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
995 %out.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
996 %vec = load <8 x bfloat>, ptr addrspace(1) %in.gep
; Only the low 16 bits of %val are used; the bitcast reinterprets them as bfloat
; without any numeric conversion.
997 %val.trunc = trunc i32 %val to i16
998 %val.cvt = bitcast i16 %val.trunc to bfloat
; Constant insertion index 3.
999 %vecins = insertelement <8 x bfloat> %vec, bfloat %val.cvt, i32 3
1000 store <8 x bfloat> %vecins, ptr addrspace(1) %out.gep
; Test: insertelement into <8 x bfloat> at a runtime index taken from the
; kernel argument %n. Each workitem loads the vector at %in[tid], replaces
; element %n with the low 16 bits of %val reinterpreted as bfloat, and stores
; the whole vector to %out[tid].
;
; Expected code shape: the index is compared against every constant 0..7 with
; s_cmp_eq_u32, and each 16-bit element is chosen between its old value and the
; new value with v_cndmask_b32 (high halves are first extracted with a 16-bit
; right shift). The halves are then repacked into dwords: and/shl/or on tahiti,
; shl plus SDWA v_or_b32 on tonga, and v_perm_b32 with selector 0x5040100 on
; gfx900 and gfx940.
1004 define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
1005 ; SI-LABEL: v_insertelement_v8bf16_dynamic:
1007 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1008 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1009 ; SI-NEXT: s_mov_b32 s6, 0
1010 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1011 ; SI-NEXT: v_mov_b32_e32 v5, 0
1012 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1013 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1014 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
1015 ; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x4
1016 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1017 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1018 ; SI-NEXT: s_cmp_eq_u32 s9, 6
1019 ; SI-NEXT: v_mov_b32_e32 v6, s8
1020 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1021 ; SI-NEXT: s_cmp_eq_u32 s9, 7
1022 ; SI-NEXT: s_waitcnt vmcnt(0)
1023 ; SI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
1024 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1025 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1026 ; SI-NEXT: s_cmp_eq_u32 s9, 4
1027 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
1028 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1029 ; SI-NEXT: s_cmp_eq_u32 s9, 5
1030 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
1031 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
1032 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1033 ; SI-NEXT: s_cmp_eq_u32 s9, 2
1034 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
1035 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1036 ; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
1037 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1038 ; SI-NEXT: s_cmp_eq_u32 s9, 3
1039 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
1040 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1041 ; SI-NEXT: v_or_b32_e32 v3, v7, v3
1042 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
1043 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
1044 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1045 ; SI-NEXT: s_cmp_eq_u32 s9, 0
1046 ; SI-NEXT: v_or_b32_e32 v2, v2, v7
1047 ; SI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc
1048 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1049 ; SI-NEXT: s_cmp_eq_u32 s9, 1
1050 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
1051 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
1052 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1053 ; SI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
1054 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
1055 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
1056 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1057 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
1058 ; SI-NEXT: v_or_b32_e32 v1, v1, v7
1059 ; SI-NEXT: v_or_b32_e32 v0, v0, v6
1060 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
1063 ; VI-LABEL: v_insertelement_v8bf16_dynamic:
1065 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1066 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1067 ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1068 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1069 ; VI-NEXT: v_mov_b32_e32 v1, s3
1070 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1071 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1072 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1073 ; VI-NEXT: v_mov_b32_e32 v5, s1
1074 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
1075 ; VI-NEXT: s_cmp_eq_u32 s5, 6
1076 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
1077 ; VI-NEXT: v_mov_b32_e32 v6, s4
1078 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1079 ; VI-NEXT: s_cmp_eq_u32 s5, 7
1080 ; VI-NEXT: s_waitcnt vmcnt(0)
1081 ; VI-NEXT: v_cndmask_b32_e32 v7, v3, v6, vcc
1082 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1083 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1084 ; VI-NEXT: s_cmp_eq_u32 s5, 4
1085 ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
1086 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1087 ; VI-NEXT: s_cmp_eq_u32 s5, 5
1088 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2
1089 ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
1090 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1091 ; VI-NEXT: s_cmp_eq_u32 s5, 2
1092 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1093 ; VI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
1094 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1095 ; VI-NEXT: s_cmp_eq_u32 s5, 3
1096 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
1097 ; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1098 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
1099 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
1100 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1101 ; VI-NEXT: s_cmp_eq_u32 s5, 0
1102 ; VI-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1103 ; VI-NEXT: v_cndmask_b32_e32 v7, v9, v6, vcc
1104 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1105 ; VI-NEXT: s_cmp_eq_u32 s5, 1
1106 ; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
1107 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
1108 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1109 ; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
1110 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
1111 ; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
1112 ; VI-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1113 ; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1114 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1117 ; GFX900-LABEL: v_insertelement_v8bf16_dynamic:
1119 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1120 ; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1121 ; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1122 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
1123 ; GFX900-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
1124 ; GFX900-NEXT: s_cmp_eq_u32 s5, 6
1125 ; GFX900-NEXT: v_mov_b32_e32 v5, s4
1126 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1127 ; GFX900-NEXT: s_cmp_eq_u32 s5, 7
1128 ; GFX900-NEXT: s_mov_b32 s2, 0x5040100
1129 ; GFX900-NEXT: s_waitcnt vmcnt(0)
1130 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc
1131 ; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1132 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1133 ; GFX900-NEXT: s_cmp_eq_u32 s5, 4
1134 ; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
1135 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1136 ; GFX900-NEXT: s_cmp_eq_u32 s5, 5
1137 ; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v2
1138 ; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
1139 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1140 ; GFX900-NEXT: s_cmp_eq_u32 s5, 2
1141 ; GFX900-NEXT: v_perm_b32 v3, v3, v6, s2
1142 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc
1143 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1144 ; GFX900-NEXT: s_cmp_eq_u32 s5, 3
1145 ; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v1
1146 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
1147 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1148 ; GFX900-NEXT: s_cmp_eq_u32 s5, 0
1149 ; GFX900-NEXT: v_perm_b32 v2, v6, v2, s2
1150 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc
1151 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1152 ; GFX900-NEXT: s_cmp_eq_u32 s5, 1
1153 ; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v0
1154 ; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
1155 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1156 ; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
1157 ; GFX900-NEXT: v_perm_b32 v1, v6, v1, s2
1158 ; GFX900-NEXT: v_perm_b32 v0, v5, v0, s2
1159 ; GFX900-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
1160 ; GFX900-NEXT: s_endpgm
1162 ; GFX940-LABEL: v_insertelement_v8bf16_dynamic:
1164 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1165 ; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1166 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1167 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1168 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1169 ; GFX940-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
1170 ; GFX940-NEXT: s_cmp_eq_u32 s7, 6
1171 ; GFX940-NEXT: v_mov_b32_e32 v5, s6
1172 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1173 ; GFX940-NEXT: s_cmp_eq_u32 s7, 7
1174 ; GFX940-NEXT: s_mov_b32 s2, 0x5040100
1175 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1176 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v3, v5, vcc
1177 ; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1178 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1179 ; GFX940-NEXT: s_cmp_eq_u32 s7, 4
1180 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
1181 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1182 ; GFX940-NEXT: s_cmp_eq_u32 s7, 5
1183 ; GFX940-NEXT: v_lshrrev_b32_e32 v7, 16, v2
1184 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
1185 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1186 ; GFX940-NEXT: s_cmp_eq_u32 s7, 2
1187 ; GFX940-NEXT: v_perm_b32 v3, v3, v6, s2
1188 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v5, vcc
1189 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1190 ; GFX940-NEXT: s_cmp_eq_u32 s7, 3
1191 ; GFX940-NEXT: v_lshrrev_b32_e32 v8, 16, v1
1192 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
1193 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1194 ; GFX940-NEXT: s_cmp_eq_u32 s7, 0
1195 ; GFX940-NEXT: v_perm_b32 v2, v6, v2, s2
1196 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v5, vcc
1197 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1198 ; GFX940-NEXT: s_cmp_eq_u32 s7, 1
1199 ; GFX940-NEXT: v_lshrrev_b32_e32 v9, 16, v0
1200 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
1201 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1202 ; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
1203 ; GFX940-NEXT: v_perm_b32 v1, v6, v1, s2
1204 ; GFX940-NEXT: v_perm_b32 v0, v5, v0, s2
1205 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
1206 ; GFX940-NEXT: s_endpgm
; Index both buffers by the sign-extended workitem id, one <8 x bfloat> per workitem.
1207 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1208 %tid.ext = sext i32 %tid to i64
1209 %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
1210 %out.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
1211 %vec = load <8 x bfloat>, ptr addrspace(1) %in.gep
; Only the low 16 bits of %val are used; the bitcast reinterprets them as bfloat
; without any numeric conversion.
1212 %val.trunc = trunc i32 %val to i16
1213 %val.cvt = bitcast i16 %val.trunc to bfloat
; The insertion index is the runtime kernel argument %n, not a constant.
1214 %vecins = insertelement <8 x bfloat> %vec, bfloat %val.cvt, i32 %n
1215 store <8 x bfloat> %vecins, ptr addrspace(1) %out.gep
; Test: insertelement into <16 x bfloat> at the constant index 3. Each workitem
; loads the vector at %in[tid], replaces element 3 with the low 16 bits of %val
; reinterpreted as bfloat, and stores the whole vector to %out[tid].
;
; Expected code shape: the 32-byte vector is moved as two dwordx4 accesses (at
; byte offsets 0 and 16). Only the second dword of the first half (v1) is
; modified; the half at offset 16 is stored back unchanged. The v1 update waits
; on vmcnt(1), and the stores are preceded by a vmcnt(0) wait.
1219 define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val) {
1220 ; SI-LABEL: v_insertelement_v16bf16_3:
1222 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1223 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1224 ; SI-NEXT: s_mov_b32 s6, 0
1225 ; SI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
1226 ; SI-NEXT: v_mov_b32_e32 v9, 0
1227 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1228 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
1229 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
1230 ; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[4:7], 0 addr64 offset:16
1231 ; SI-NEXT: s_load_dword s8, s[8:9], 0x4
1232 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
1233 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1234 ; SI-NEXT: s_lshl_b32 s4, s8, 16
1235 ; SI-NEXT: s_waitcnt vmcnt(1)
1236 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
1237 ; SI-NEXT: v_or_b32_e32 v1, s4, v1
1238 ; SI-NEXT: s_waitcnt vmcnt(0)
1239 ; SI-NEXT: buffer_store_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64 offset:16
1240 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64
1243 ; VI-LABEL: v_insertelement_v16bf16_3:
1245 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1246 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
1247 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
1248 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1249 ; VI-NEXT: v_mov_b32_e32 v1, s3
1250 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
1251 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1252 ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
1253 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1254 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1255 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
1256 ; VI-NEXT: v_mov_b32_e32 v9, s1
1257 ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
1258 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
1259 ; VI-NEXT: s_lshl_b32 s1, s4, 16
1260 ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
1261 ; VI-NEXT: v_mov_b32_e32 v12, s1
1262 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
1263 ; VI-NEXT: s_waitcnt vmcnt(1)
1264 ; VI-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1265 ; VI-NEXT: s_waitcnt vmcnt(0)
1266 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
1267 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
1270 ; GFX900-LABEL: v_insertelement_v16bf16_3:
1272 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1273 ; GFX900-NEXT: s_load_dword s4, s[8:9], 0x10
1274 ; GFX900-NEXT: v_lshlrev_b32_e32 v8, 5, v0
1275 ; GFX900-NEXT: v_mov_b32_e32 v9, 0x5040100
1276 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
1277 ; GFX900-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
1278 ; GFX900-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
1279 ; GFX900-NEXT: s_waitcnt vmcnt(1)
1280 ; GFX900-NEXT: v_perm_b32 v1, s4, v1, v9
1281 ; GFX900-NEXT: s_waitcnt vmcnt(0)
1282 ; GFX900-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
1283 ; GFX900-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
1284 ; GFX900-NEXT: s_endpgm
1286 ; GFX940-LABEL: v_insertelement_v16bf16_3:
1288 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1289 ; GFX940-NEXT: s_load_dword s6, s[4:5], 0x10
1290 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1291 ; GFX940-NEXT: v_lshlrev_b32_e32 v8, 5, v0
1292 ; GFX940-NEXT: v_mov_b32_e32 v9, 0x5040100
1293 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1294 ; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
1295 ; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
1296 ; GFX940-NEXT: s_waitcnt vmcnt(1)
1297 ; GFX940-NEXT: v_perm_b32 v1, s6, v1, v9
1298 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1299 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1
1300 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
1301 ; GFX940-NEXT: s_endpgm
; Index both buffers by the sign-extended workitem id, one <16 x bfloat> per workitem.
1302 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1303 %tid.ext = sext i32 %tid to i64
1304 %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
1305 %out.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
1306 %vec = load <16 x bfloat>, ptr addrspace(1) %in.gep
; Only the low 16 bits of %val are used; the bitcast reinterprets them as bfloat
; without any numeric conversion.
1307 %val.trunc = trunc i32 %val to i16
1308 %val.cvt = bitcast i16 %val.trunc to bfloat
; Constant insertion index 3.
1309 %vecins = insertelement <16 x bfloat> %vec, bfloat %val.cvt, i32 3
1310 store <16 x bfloat> %vecins, ptr addrspace(1) %out.gep
1314 define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
1315 ; SI-LABEL: v_insertelement_v16bf16_dynamic:
1317 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1318 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
1319 ; SI-NEXT: s_mov_b32 s11, 0x100f000
1320 ; SI-NEXT: s_mov_b32 s10, 0
1321 ; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0
1322 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1323 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
1324 ; SI-NEXT: v_mov_b32_e32 v5, 0
1325 ; SI-NEXT: buffer_load_dwordx4 v[7:10], v[4:5], s[8:11], 0 addr64
1326 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 offset:16
1327 ; SI-NEXT: s_cmp_eq_u32 s5, 6
1328 ; SI-NEXT: v_mov_b32_e32 v6, s4
1329 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1330 ; SI-NEXT: s_cmp_eq_u32 s5, 7
1331 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1332 ; SI-NEXT: s_waitcnt vmcnt(1)
1333 ; SI-NEXT: v_cndmask_b32_e32 v11, v10, v6, vcc
1334 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
1335 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1336 ; SI-NEXT: s_cmp_eq_u32 s5, 4
1337 ; SI-NEXT: v_cndmask_b32_e32 v10, v10, v6, vcc
1338 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1339 ; SI-NEXT: s_cmp_eq_u32 s5, 5
1340 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v9
1341 ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v6, vcc
1342 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1343 ; SI-NEXT: s_cmp_eq_u32 s5, 2
1344 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11
1345 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
1346 ; SI-NEXT: v_cndmask_b32_e32 v12, v12, v6, vcc
1347 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1348 ; SI-NEXT: s_cmp_eq_u32 s5, 3
1349 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v8
1350 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9
1351 ; SI-NEXT: v_or_b32_e32 v10, v11, v10
1352 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v12
1353 ; SI-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
1354 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1355 ; SI-NEXT: s_cmp_eq_u32 s5, 0
1356 ; SI-NEXT: v_or_b32_e32 v9, v9, v11
1357 ; SI-NEXT: v_cndmask_b32_e32 v11, v13, v6, vcc
1358 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1359 ; SI-NEXT: s_cmp_eq_u32 s5, 1
1360 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v7
1361 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
1362 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
1363 ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v6, vcc
1364 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1365 ; SI-NEXT: s_cmp_eq_u32 s5, 14
1366 ; SI-NEXT: v_or_b32_e32 v8, v8, v11
1367 ; SI-NEXT: v_cndmask_b32_e32 v11, v14, v6, vcc
1368 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1369 ; SI-NEXT: s_cmp_eq_u32 s5, 15
1370 ; SI-NEXT: s_waitcnt vmcnt(0)
1371 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
1372 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7
1373 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
1374 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
1375 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1376 ; SI-NEXT: s_cmp_eq_u32 s5, 12
1377 ; SI-NEXT: v_or_b32_e32 v7, v7, v11
1378 ; SI-NEXT: v_cndmask_b32_e32 v11, v15, v6, vcc
1379 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1380 ; SI-NEXT: s_cmp_eq_u32 s5, 13
1381 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
1382 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
1383 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
1384 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
1385 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1386 ; SI-NEXT: s_cmp_eq_u32 s5, 10
1387 ; SI-NEXT: v_or_b32_e32 v3, v3, v11
1388 ; SI-NEXT: v_cndmask_b32_e32 v11, v16, v6, vcc
1389 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1390 ; SI-NEXT: s_cmp_eq_u32 s5, 11
1391 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1
1392 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1393 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
1394 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
1395 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1396 ; SI-NEXT: s_cmp_eq_u32 s5, 8
1397 ; SI-NEXT: v_or_b32_e32 v2, v2, v11
1398 ; SI-NEXT: v_cndmask_b32_e32 v11, v17, v6, vcc
1399 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1400 ; SI-NEXT: s_cmp_eq_u32 s5, 9
1401 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
1402 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
1403 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1404 ; SI-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc
1405 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
1406 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
1407 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1408 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
1409 ; SI-NEXT: v_or_b32_e32 v1, v1, v11
1410 ; SI-NEXT: v_or_b32_e32 v0, v0, v6
1411 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 offset:16
1412 ; SI-NEXT: buffer_store_dwordx4 v[7:10], v[4:5], s[0:3], 0 addr64
1415 ; VI-LABEL: v_insertelement_v16bf16_dynamic:
1417 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1418 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1419 ; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
1420 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1421 ; VI-NEXT: v_mov_b32_e32 v0, s3
1422 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8
1423 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
1424 ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4
1425 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
1426 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1427 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
1428 ; VI-NEXT: v_mov_b32_e32 v9, s1
1429 ; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
1430 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
1431 ; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
1432 ; VI-NEXT: s_cmp_eq_u32 s5, 14
1433 ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
1434 ; VI-NEXT: v_mov_b32_e32 v12, s4
1435 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1436 ; VI-NEXT: s_cmp_eq_u32 s5, 15
1437 ; VI-NEXT: s_waitcnt vmcnt(1)
1438 ; VI-NEXT: v_cndmask_b32_e32 v13, v3, v12, vcc
1439 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1440 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1441 ; VI-NEXT: s_cmp_eq_u32 s5, 12
1442 ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
1443 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1444 ; VI-NEXT: s_cmp_eq_u32 s5, 13
1445 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
1446 ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
1447 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1448 ; VI-NEXT: s_cmp_eq_u32 s5, 10
1449 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
1450 ; VI-NEXT: v_cndmask_b32_e32 v14, v14, v12, vcc
1451 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1452 ; VI-NEXT: s_cmp_eq_u32 s5, 11
1453 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v1
1454 ; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1455 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v14
1456 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc
1457 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1458 ; VI-NEXT: s_cmp_eq_u32 s5, 8
1459 ; VI-NEXT: v_or_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1460 ; VI-NEXT: v_cndmask_b32_e32 v13, v15, v12, vcc
1461 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1462 ; VI-NEXT: s_cmp_eq_u32 s5, 9
1463 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
1464 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
1465 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
1466 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1467 ; VI-NEXT: s_cmp_eq_u32 s5, 6
1468 ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1469 ; VI-NEXT: v_cndmask_b32_e32 v13, v16, v12, vcc
1470 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1471 ; VI-NEXT: s_cmp_eq_u32 s5, 7
1472 ; VI-NEXT: s_waitcnt vmcnt(0)
1473 ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v7
1474 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
1475 ; VI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
1476 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1477 ; VI-NEXT: s_cmp_eq_u32 s5, 4
1478 ; VI-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1479 ; VI-NEXT: v_cndmask_b32_e32 v13, v17, v12, vcc
1480 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1481 ; VI-NEXT: s_cmp_eq_u32 s5, 5
1482 ; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v6
1483 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
1484 ; VI-NEXT: v_cndmask_b32_e32 v6, v6, v12, vcc
1485 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1486 ; VI-NEXT: s_cmp_eq_u32 s5, 2
1487 ; VI-NEXT: v_or_b32_sdwa v7, v7, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1488 ; VI-NEXT: v_cndmask_b32_e32 v13, v18, v12, vcc
1489 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1490 ; VI-NEXT: s_cmp_eq_u32 s5, 3
1491 ; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
1492 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
1493 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc
1494 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1495 ; VI-NEXT: s_cmp_eq_u32 s5, 0
1496 ; VI-NEXT: v_or_b32_sdwa v6, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1497 ; VI-NEXT: v_cndmask_b32_e32 v13, v19, v12, vcc
1498 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1499 ; VI-NEXT: s_cmp_eq_u32 s5, 1
1500 ; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v4
1501 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
1502 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1503 ; VI-NEXT: v_cndmask_b32_e32 v12, v20, v12, vcc
1504 ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
1505 ; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
1506 ; VI-NEXT: v_or_b32_sdwa v5, v5, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1507 ; VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1508 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
1509 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
1512 ; GFX900-LABEL: v_insertelement_v16bf16_dynamic:
1514 ; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1515 ; GFX900-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
1516 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 5, v0
1517 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
1518 ; GFX900-NEXT: global_load_dwordx4 v[1:4], v0, s[2:3]
1519 ; GFX900-NEXT: global_load_dwordx4 v[5:8], v0, s[2:3] offset:16
1520 ; GFX900-NEXT: s_cmp_eq_u32 s5, 6
1521 ; GFX900-NEXT: v_mov_b32_e32 v9, s4
1522 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1523 ; GFX900-NEXT: s_cmp_eq_u32 s5, 7
1524 ; GFX900-NEXT: s_mov_b32 s2, 0x5040100
1525 ; GFX900-NEXT: s_waitcnt vmcnt(1)
1526 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v4, v9, vcc
1527 ; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4
1528 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1529 ; GFX900-NEXT: s_cmp_eq_u32 s5, 4
1530 ; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
1531 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1532 ; GFX900-NEXT: s_cmp_eq_u32 s5, 5
1533 ; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v3
1534 ; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
1535 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1536 ; GFX900-NEXT: s_cmp_eq_u32 s5, 2
1537 ; GFX900-NEXT: v_perm_b32 v4, v4, v10, s2
1538 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc
1539 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1540 ; GFX900-NEXT: s_cmp_eq_u32 s5, 3
1541 ; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v2
1542 ; GFX900-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc
1543 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1544 ; GFX900-NEXT: s_cmp_eq_u32 s5, 0
1545 ; GFX900-NEXT: v_perm_b32 v3, v10, v3, s2
1546 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc
1547 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1548 ; GFX900-NEXT: s_cmp_eq_u32 s5, 1
1549 ; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v1
1550 ; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
1551 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1552 ; GFX900-NEXT: s_cmp_eq_u32 s5, 14
1553 ; GFX900-NEXT: v_perm_b32 v2, v10, v2, s2
1554 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc
1555 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1556 ; GFX900-NEXT: s_cmp_eq_u32 s5, 15
1557 ; GFX900-NEXT: s_waitcnt vmcnt(0)
1558 ; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v8
1559 ; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc
1560 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1561 ; GFX900-NEXT: s_cmp_eq_u32 s5, 12
1562 ; GFX900-NEXT: v_perm_b32 v1, v10, v1, s2
1563 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc
1564 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1565 ; GFX900-NEXT: s_cmp_eq_u32 s5, 13
1566 ; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v7
1567 ; GFX900-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
1568 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1569 ; GFX900-NEXT: s_cmp_eq_u32 s5, 10
1570 ; GFX900-NEXT: v_perm_b32 v8, v10, v8, s2
1571 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc
1572 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1573 ; GFX900-NEXT: s_cmp_eq_u32 s5, 11
1574 ; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v6
1575 ; GFX900-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
1576 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1577 ; GFX900-NEXT: s_cmp_eq_u32 s5, 8
1578 ; GFX900-NEXT: v_perm_b32 v7, v10, v7, s2
1579 ; GFX900-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc
1580 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1581 ; GFX900-NEXT: s_cmp_eq_u32 s5, 9
1582 ; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v5
1583 ; GFX900-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
1584 ; GFX900-NEXT: s_cselect_b64 vcc, -1, 0
1585 ; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc
1586 ; GFX900-NEXT: v_perm_b32 v6, v10, v6, s2
1587 ; GFX900-NEXT: v_perm_b32 v5, v9, v5, s2
1588 ; GFX900-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
1589 ; GFX900-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1]
1590 ; GFX900-NEXT: s_endpgm
1592 ; GFX940-LABEL: v_insertelement_v16bf16_dynamic:
1594 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1595 ; GFX940-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1596 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1597 ; GFX940-NEXT: v_lshlrev_b32_e32 v8, 5, v0
1598 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1599 ; GFX940-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
1600 ; GFX940-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
1601 ; GFX940-NEXT: s_cmp_eq_u32 s7, 6
1602 ; GFX940-NEXT: v_mov_b32_e32 v9, s6
1603 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1604 ; GFX940-NEXT: s_cmp_eq_u32 s7, 7
1605 ; GFX940-NEXT: s_mov_b32 s2, 0x5040100
1606 ; GFX940-NEXT: s_waitcnt vmcnt(1)
1607 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v3, v9, vcc
1608 ; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1609 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1610 ; GFX940-NEXT: s_cmp_eq_u32 s7, 4
1611 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
1612 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1613 ; GFX940-NEXT: s_cmp_eq_u32 s7, 5
1614 ; GFX940-NEXT: v_lshrrev_b32_e32 v11, 16, v2
1615 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc
1616 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1617 ; GFX940-NEXT: s_cmp_eq_u32 s7, 2
1618 ; GFX940-NEXT: v_perm_b32 v3, v3, v10, s2
1619 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v11, v9, vcc
1620 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1621 ; GFX940-NEXT: s_cmp_eq_u32 s7, 3
1622 ; GFX940-NEXT: v_lshrrev_b32_e32 v12, 16, v1
1623 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
1624 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1625 ; GFX940-NEXT: s_cmp_eq_u32 s7, 0
1626 ; GFX940-NEXT: v_perm_b32 v2, v10, v2, s2
1627 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v12, v9, vcc
1628 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1629 ; GFX940-NEXT: s_cmp_eq_u32 s7, 1
1630 ; GFX940-NEXT: v_lshrrev_b32_e32 v13, 16, v0
1631 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
1632 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1633 ; GFX940-NEXT: s_cmp_eq_u32 s7, 14
1634 ; GFX940-NEXT: v_perm_b32 v1, v10, v1, s2
1635 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v13, v9, vcc
1636 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1637 ; GFX940-NEXT: s_cmp_eq_u32 s7, 15
1638 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1639 ; GFX940-NEXT: v_lshrrev_b32_e32 v14, 16, v7
1640 ; GFX940-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
1641 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1642 ; GFX940-NEXT: s_cmp_eq_u32 s7, 12
1643 ; GFX940-NEXT: v_perm_b32 v0, v10, v0, s2
1644 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v14, v9, vcc
1645 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1646 ; GFX940-NEXT: s_cmp_eq_u32 s7, 13
1647 ; GFX940-NEXT: v_lshrrev_b32_e32 v15, 16, v6
1648 ; GFX940-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
1649 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1650 ; GFX940-NEXT: s_cmp_eq_u32 s7, 10
1651 ; GFX940-NEXT: v_perm_b32 v7, v10, v7, s2
1652 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v15, v9, vcc
1653 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1654 ; GFX940-NEXT: s_cmp_eq_u32 s7, 11
1655 ; GFX940-NEXT: v_lshrrev_b32_e32 v16, 16, v5
1656 ; GFX940-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
1657 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1658 ; GFX940-NEXT: s_cmp_eq_u32 s7, 8
1659 ; GFX940-NEXT: v_perm_b32 v6, v10, v6, s2
1660 ; GFX940-NEXT: v_cndmask_b32_e32 v10, v16, v9, vcc
1661 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1662 ; GFX940-NEXT: s_cmp_eq_u32 s7, 9
1663 ; GFX940-NEXT: v_lshrrev_b32_e32 v17, 16, v4
1664 ; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
1665 ; GFX940-NEXT: s_cselect_b64 vcc, -1, 0
1666 ; GFX940-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc
1667 ; GFX940-NEXT: v_perm_b32 v5, v10, v5, s2
1668 ; GFX940-NEXT: v_perm_b32 v4, v9, v4, s2
1669 ; GFX940-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 sc0 sc1
1670 ; GFX940-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
1671 ; GFX940-NEXT: s_endpgm
1672 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1673 %tid.ext = sext i32 %tid to i64
1674 %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
1675 %out.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %out, i64 %tid.ext
1676 %vec = load <16 x bfloat>, ptr addrspace(1) %in.gep
1677 %val.trunc = trunc i32 %val to i16
1678 %val.cvt = bitcast i16 %val.trunc to bfloat
1679 %vecins = insertelement <16 x bfloat> %vec, bfloat %val.cvt, i32 %n
1680 store <16 x bfloat> %vecins, ptr addrspace(1) %out.gep
; Declaration of the work-item ID (x dimension) intrinsic. The kernel body
; above calls it to obtain %tid, which indexes the per-lane input and output
; vectors.
1684 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 is attached to the kernel definitions in this file.
1686 attributes #0 = { nounwind }
; Attribute group #1 is attached to the intrinsic declaration and its call
; sites; readnone marks the callee as not accessing memory.
1687 attributes #1 = { nounwind readnone }