1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; Constant-index extracts of both elements of a <2 x half> that is loaded
; through the addrspace(4) pointer %vec.ptr. Element 1 is stored to %out and
; element 0 is stored 10 halves past %out.
6 define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
7 ; SI-LABEL: extract_vector_elt_v2f16:
9 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
10 ; SI-NEXT: s_waitcnt lgkmcnt(0)
11 ; SI-NEXT: s_load_dword s4, s[2:3], 0x0
12 ; SI-NEXT: s_mov_b32 s3, 0xf000
13 ; SI-NEXT: s_waitcnt lgkmcnt(0)
14 ; SI-NEXT: s_lshr_b32 s5, s4, 16
15 ; SI-NEXT: s_mov_b32 s2, -1
16 ; SI-NEXT: v_mov_b32_e32 v0, s4
17 ; SI-NEXT: v_mov_b32_e32 v1, s5
18 ; SI-NEXT: buffer_store_short v1, off, s[0:3], 0
19 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:20
22 ; VI-LABEL: extract_vector_elt_v2f16:
24 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
25 ; VI-NEXT: s_waitcnt lgkmcnt(0)
26 ; VI-NEXT: s_load_dword s4, s[2:3], 0x0
27 ; VI-NEXT: s_mov_b32 s3, 0xf000
28 ; VI-NEXT: s_mov_b32 s2, -1
29 ; VI-NEXT: s_waitcnt lgkmcnt(0)
30 ; VI-NEXT: s_lshr_b32 s5, s4, 16
31 ; VI-NEXT: v_mov_b32_e32 v1, s5
32 ; VI-NEXT: v_mov_b32_e32 v0, s4
33 ; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
34 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:20
37 ; GFX11-LABEL: extract_vector_elt_v2f16:
39 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
40 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
41 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
42 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
43 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
44 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
45 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
46 ; GFX11-NEXT: s_mov_b32 s2, -1
47 ; GFX11-NEXT: s_clause 0x1
48 ; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
49 ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:20
51 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
52 ; GFX11-NEXT: s_endpgm
53 %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
54 %p0 = extractelement <2 x half> %vec, i32 0
55 %p1 = extractelement <2 x half> %vec, i32 1
56 %out1 = getelementptr half, ptr addrspace(1) %out, i32 10 ; 10 halves past %out (appears as offset 20 in the expected stores)
57 store half %p1, ptr addrspace(1) %out, align 2
58 store half %p0, ptr addrspace(1) %out1, align 2
; Variable-index extract from a <2 x half> loaded through %vec.ptr, where the
; index %idx is a kernel argument. The extracted half is stored to %out.
; The expected output for all three targets shifts the loaded dword right by
; (idx << 4) using scalar shift instructions.
62 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 {
63 ; SI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr:
65 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
66 ; SI-NEXT: s_load_dword s0, s[2:3], 0xd
67 ; SI-NEXT: s_waitcnt lgkmcnt(0)
68 ; SI-NEXT: s_load_dword s1, s[6:7], 0x0
69 ; SI-NEXT: s_mov_b32 s7, 0xf000
70 ; SI-NEXT: s_lshl_b32 s0, s0, 4
71 ; SI-NEXT: s_waitcnt lgkmcnt(0)
72 ; SI-NEXT: s_lshr_b32 s0, s1, s0
73 ; SI-NEXT: s_mov_b32 s6, -1
74 ; SI-NEXT: v_mov_b32_e32 v0, s0
75 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
78 ; VI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr:
80 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
81 ; VI-NEXT: s_load_dword s8, s[2:3], 0x34
82 ; VI-NEXT: s_mov_b32 s3, 0xf000
83 ; VI-NEXT: s_mov_b32 s2, -1
84 ; VI-NEXT: s_waitcnt lgkmcnt(0)
85 ; VI-NEXT: s_load_dword s6, s[6:7], 0x0
86 ; VI-NEXT: s_mov_b32 s0, s4
87 ; VI-NEXT: s_lshl_b32 s4, s8, 4
88 ; VI-NEXT: s_mov_b32 s1, s5
89 ; VI-NEXT: s_waitcnt lgkmcnt(0)
90 ; VI-NEXT: s_lshr_b32 s4, s6, s4
91 ; VI-NEXT: v_mov_b32_e32 v0, s4
92 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
95 ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_sgpr:
97 ; GFX11-NEXT: s_clause 0x1
98 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
99 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34
100 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
101 ; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
102 ; GFX11-NEXT: s_lshl_b32 s0, s0, 4
103 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
104 ; GFX11-NEXT: s_mov_b32 s6, -1
105 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
106 ; GFX11-NEXT: s_lshr_b32 s0, s1, s0
107 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
108 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
109 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
110 ; GFX11-NEXT: s_nop 0
111 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
112 ; GFX11-NEXT: s_endpgm
113 %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
114 %elt = extractelement <2 x half> %vec, i32 %idx
115 store half %elt, ptr addrspace(1) %out, align 2
; Variable-index extract from a <2 x half> loaded through %vec.ptr, where the
; index is loaded from %idx.ptr at a position selected by the work-item id.
; The extracted half is stored to %out at the same work-item position.
; The expected output shifts the loaded dword right by (idx << 4) using a
; vector shift instruction.
119 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 {
120 ; SI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr:
122 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
123 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
124 ; SI-NEXT: s_mov_b32 s3, 0xf000
125 ; SI-NEXT: s_mov_b32 s2, 0
126 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
127 ; SI-NEXT: v_mov_b32_e32 v2, 0
128 ; SI-NEXT: s_waitcnt lgkmcnt(0)
129 ; SI-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64
130 ; SI-NEXT: s_load_dword s6, s[6:7], 0x0
131 ; SI-NEXT: s_mov_b64 s[0:1], s[4:5]
132 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
133 ; SI-NEXT: s_waitcnt vmcnt(0)
134 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v3
135 ; SI-NEXT: s_waitcnt lgkmcnt(0)
136 ; SI-NEXT: v_lshr_b32_e32 v0, s6, v0
137 ; SI-NEXT: buffer_store_short v0, v[1:2], s[0:3], 0 addr64
140 ; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr:
142 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
143 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
144 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
145 ; VI-NEXT: s_waitcnt lgkmcnt(0)
146 ; VI-NEXT: v_mov_b32_e32 v2, s1
147 ; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
148 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
149 ; VI-NEXT: flat_load_dword v2, v[1:2]
150 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
151 ; VI-NEXT: s_waitcnt lgkmcnt(0)
152 ; VI-NEXT: v_mov_b32_e32 v1, s1
153 ; VI-NEXT: s_load_dword s1, s[2:3], 0x0
154 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
155 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
156 ; VI-NEXT: s_waitcnt vmcnt(0)
157 ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
158 ; VI-NEXT: s_waitcnt lgkmcnt(0)
159 ; VI-NEXT: v_lshrrev_b32_e64 v2, v2, s1
160 ; VI-NEXT: flat_store_short v[0:1], v2
163 ; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr:
165 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
166 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
167 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
168 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
169 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
170 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
171 ; GFX11-NEXT: global_load_b32 v1, v1, s[0:1]
172 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
173 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
174 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
175 ; GFX11-NEXT: s_waitcnt vmcnt(0)
176 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1
177 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
178 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
179 ; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2
180 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
181 ; GFX11-NEXT: s_nop 0
182 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
183 ; GFX11-NEXT: s_endpgm
184 %tid = call i32 @llvm.amdgcn.workitem.id.x()
185 %tid.ext = sext i32 %tid to i64
186 %gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
187 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
188 %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
189 %idx = load i32, ptr addrspace(1) %gep ; per-work-item extract index
190 %elt = extractelement <2 x half> %vec, i32 %idx
191 store half %elt, ptr addrspace(1) %out.gep, align 2
; Constant-index extracts of elements 0 and 2 from a <3 x half> kernel
; argument. Element 2 is stored to %out and element 0 is stored one half
; past %out.
195 define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo) #0 {
196 ; SI-LABEL: extract_vector_elt_v3f16:
198 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
199 ; SI-NEXT: s_mov_b32 s7, 0xf000
200 ; SI-NEXT: s_mov_b32 s6, -1
201 ; SI-NEXT: s_waitcnt lgkmcnt(0)
202 ; SI-NEXT: s_mov_b32 s4, s0
203 ; SI-NEXT: s_mov_b32 s5, s1
204 ; SI-NEXT: v_mov_b32_e32 v0, s3
205 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
206 ; SI-NEXT: s_waitcnt expcnt(0)
207 ; SI-NEXT: v_mov_b32_e32 v0, s2
208 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2
211 ; VI-LABEL: extract_vector_elt_v3f16:
213 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
214 ; VI-NEXT: s_mov_b32 s7, 0xf000
215 ; VI-NEXT: s_mov_b32 s6, -1
216 ; VI-NEXT: s_waitcnt lgkmcnt(0)
217 ; VI-NEXT: s_mov_b32 s4, s0
218 ; VI-NEXT: s_mov_b32 s5, s1
219 ; VI-NEXT: v_mov_b32_e32 v0, s3
220 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
221 ; VI-NEXT: v_mov_b32_e32 v0, s2
222 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2
225 ; GFX11-LABEL: extract_vector_elt_v3f16:
227 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
228 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
229 ; GFX11-NEXT: s_mov_b32 s6, -1
230 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
231 ; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
232 ; GFX11-NEXT: s_mov_b32 s4, s0
233 ; GFX11-NEXT: s_mov_b32 s5, s1
234 ; GFX11-NEXT: s_clause 0x1
235 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
236 ; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
237 ; GFX11-NEXT: s_nop 0
238 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
239 ; GFX11-NEXT: s_endpgm
240 %p0 = extractelement <3 x half> %foo, i32 0
241 %p1 = extractelement <3 x half> %foo, i32 2
242 %out1 = getelementptr half, ptr addrspace(1) %out, i32 1 ; one half past %out (offset:2 in the expected stores)
243 store half %p1, ptr addrspace(1) %out, align 2
244 store half %p0, ptr addrspace(1) %out1, align 2
248 ; FIXME: Why is a vector shift sometimes selected here?
; Variable-index extract from a <3 x half> kernel argument, where the index
; %idx is also a kernel argument. The extracted half is stored to %out.
; The expected output for all three targets shifts the 64-bit register pair
; holding the vector right by (idx << 4) with a scalar 64-bit shift.
249 define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo, i32 %idx) #0 {
250 ; SI-LABEL: dynamic_extract_vector_elt_v3f16:
252 ; SI-NEXT: s_load_dword s4, s[2:3], 0xd
253 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
254 ; SI-NEXT: s_mov_b32 s7, 0xf000
255 ; SI-NEXT: s_waitcnt lgkmcnt(0)
256 ; SI-NEXT: s_lshl_b32 s4, s4, 4
257 ; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
258 ; SI-NEXT: s_mov_b32 s6, -1
259 ; SI-NEXT: s_mov_b32 s4, s0
260 ; SI-NEXT: s_mov_b32 s5, s1
261 ; SI-NEXT: v_mov_b32_e32 v0, s2
262 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
265 ; VI-LABEL: dynamic_extract_vector_elt_v3f16:
267 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
268 ; VI-NEXT: s_load_dword s8, s[2:3], 0x34
269 ; VI-NEXT: s_mov_b32 s3, 0xf000
270 ; VI-NEXT: s_mov_b32 s2, -1
271 ; VI-NEXT: s_waitcnt lgkmcnt(0)
272 ; VI-NEXT: s_mov_b32 s0, s4
273 ; VI-NEXT: s_lshl_b32 s4, s8, 4
274 ; VI-NEXT: s_mov_b32 s1, s5
275 ; VI-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
276 ; VI-NEXT: v_mov_b32_e32 v0, s4
277 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
280 ; GFX11-LABEL: dynamic_extract_vector_elt_v3f16:
282 ; GFX11-NEXT: s_clause 0x1
283 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x34
284 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
285 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
286 ; GFX11-NEXT: s_lshl_b32 s4, s4, 4
287 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
288 ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
289 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
290 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
291 ; GFX11-NEXT: s_mov_b32 s2, -1
292 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
293 ; GFX11-NEXT: s_nop 0
294 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
295 ; GFX11-NEXT: s_endpgm
296 %p0 = extractelement <3 x half> %foo, i32 %idx
298 store half %p0, ptr addrspace(1) %out
; Constant-index extract of element 2 from a <4 x half> that each work item
; loads from %in at a position selected by its work-item id. The extracted
; half is stored to %out at the same work-item position.
; The expected output loads only one dword at byte offset 4 of the vector
; instead of the full 8-byte vector.
302 define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
303 ; SI-LABEL: v_extractelement_v4f16_2:
305 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
306 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
307 ; SI-NEXT: s_mov_b32 s6, 0
308 ; SI-NEXT: s_mov_b32 s7, 0xf000
309 ; SI-NEXT: v_mov_b32_e32 v2, 0
310 ; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
311 ; SI-NEXT: s_waitcnt lgkmcnt(0)
312 ; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
313 ; SI-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4
314 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
315 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
316 ; SI-NEXT: s_waitcnt vmcnt(0)
317 ; SI-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64
320 ; VI-LABEL: v_extractelement_v4f16_2:
322 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
323 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
324 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
325 ; VI-NEXT: s_waitcnt lgkmcnt(0)
326 ; VI-NEXT: v_mov_b32_e32 v2, s3
327 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
328 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
329 ; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v1
330 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
331 ; VI-NEXT: flat_load_dword v2, v[1:2]
332 ; VI-NEXT: v_mov_b32_e32 v1, s1
333 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
334 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
335 ; VI-NEXT: s_waitcnt vmcnt(0)
336 ; VI-NEXT: flat_store_short v[0:1], v2
339 ; GFX11-LABEL: v_extractelement_v4f16_2:
341 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
342 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
343 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
344 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
345 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
346 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
347 ; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] offset:4
348 ; GFX11-NEXT: s_waitcnt vmcnt(0)
349 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
350 ; GFX11-NEXT: s_nop 0
351 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
352 ; GFX11-NEXT: s_endpgm
353 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
354 %tid.ext = sext i32 %tid to i64
355 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
356 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
357 %vec = load <4 x half>, ptr addrspace(1) %in.gep
358 %vec.extract = extractelement <4 x half> %vec, i32 2
359 store half %vec.extract, ptr addrspace(1) %out.gep
; Variable-index extract from a <4 x half> that each work item loads from
; %in at a position selected by its work-item id. The index comes from a
; volatile i32 load through an undef addrspace(1) pointer. The extracted half
; is stored to %out at the same work-item position.
; The expected output shifts the loaded 64-bit value right by (idx << 4).
; NOTE(review): despite "insertelement" in the name, the body only performs
; an extractelement. The name is left as is because the LABEL check lines
; match on it.
363 define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
364 ; SI-LABEL: v_insertelement_v4f16_dynamic_vgpr:
366 ; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
367 ; SI-NEXT: s_mov_b32 s6, 0
368 ; SI-NEXT: s_mov_b32 s7, 0xf000
369 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
370 ; SI-NEXT: v_mov_b32_e32 v2, 0
371 ; SI-NEXT: s_mov_b32 s10, -1
372 ; SI-NEXT: s_mov_b32 s11, s7
373 ; SI-NEXT: buffer_load_dword v5, off, s[8:11], 0 glc
374 ; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
375 ; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
376 ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
377 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
378 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
379 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v5
380 ; SI-NEXT: s_waitcnt vmcnt(0)
381 ; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], v0
382 ; SI-NEXT: buffer_store_short v3, v[1:2], s[0:3], 0 addr64
385 ; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr:
387 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
388 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
389 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
390 ; VI-NEXT: s_waitcnt lgkmcnt(0)
391 ; VI-NEXT: v_mov_b32_e32 v2, s3
392 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
393 ; VI-NEXT: s_mov_b32 s3, 0xf000
394 ; VI-NEXT: s_mov_b32 s2, -1
395 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
396 ; VI-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
397 ; VI-NEXT: s_waitcnt vmcnt(0)
398 ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
399 ; VI-NEXT: v_mov_b32_e32 v5, s1
400 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v3
401 ; VI-NEXT: s_waitcnt vmcnt(0)
402 ; VI-NEXT: v_lshrrev_b64 v[0:1], v0, v[1:2]
403 ; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v4
404 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
405 ; VI-NEXT: flat_store_short v[1:2], v0
408 ; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr:
410 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
411 ; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0
412 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
413 ; GFX11-NEXT: s_mov_b32 s6, -1
414 ; GFX11-NEXT: buffer_load_b32 v3, off, s[4:7], 0 glc dlc
415 ; GFX11-NEXT: s_waitcnt vmcnt(0)
416 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2
417 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
418 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
419 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3
420 ; GFX11-NEXT: s_waitcnt vmcnt(0)
421 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
422 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v3, v[0:1]
423 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v2
424 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
425 ; GFX11-NEXT: s_nop 0
426 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
427 ; GFX11-NEXT: s_endpgm
428 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
429 %tid.ext = sext i32 %tid to i64
430 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
431 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
432 %idx.val = load volatile i32, ptr addrspace(1) undef ; extract index, read before the vector load
433 %vec = load <4 x half>, ptr addrspace(1) %in.gep
434 %vec.extract = extractelement <4 x half> %vec, i32 %idx.val
435 store half %vec.extract, ptr addrspace(1) %out.gep
; Loads a <16 x half> through %ptr but uses only elements 0 and 1, each
; written with a volatile store through an undef addrspace(1) pointer.
; The expected output loads a single dword at offset 0x0 instead of the
; whole vector, and takes element 1 from it with a 16-bit right shift.
; NOTE(review): the name says v8f16 but the IR below operates on
; <16 x half>. The name is left as is because the LABEL check lines match
; on it.
439 define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) %ptr) #0 {
440 ; SI-LABEL: reduce_load_vector_v8f16_extract_01:
442 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
443 ; SI-NEXT: s_waitcnt lgkmcnt(0)
444 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0
445 ; SI-NEXT: s_mov_b32 s3, 0xf000
446 ; SI-NEXT: s_mov_b32 s2, -1
447 ; SI-NEXT: s_waitcnt lgkmcnt(0)
448 ; SI-NEXT: s_lshr_b32 s1, s0, 16
449 ; SI-NEXT: v_mov_b32_e32 v0, s0
450 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
451 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
452 ; SI-NEXT: v_mov_b32_e32 v0, s1
453 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
454 ; SI-NEXT: s_waitcnt vmcnt(0)
457 ; VI-LABEL: reduce_load_vector_v8f16_extract_01:
459 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
460 ; VI-NEXT: s_mov_b32 s3, 0xf000
461 ; VI-NEXT: s_mov_b32 s2, -1
462 ; VI-NEXT: s_waitcnt lgkmcnt(0)
463 ; VI-NEXT: s_load_dword s0, s[0:1], 0x0
464 ; VI-NEXT: s_waitcnt lgkmcnt(0)
465 ; VI-NEXT: s_lshr_b32 s1, s0, 16
466 ; VI-NEXT: v_mov_b32_e32 v0, s0
467 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
468 ; VI-NEXT: s_waitcnt vmcnt(0)
469 ; VI-NEXT: v_mov_b32_e32 v0, s1
470 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
471 ; VI-NEXT: s_waitcnt vmcnt(0)
474 ; GFX11-LABEL: reduce_load_vector_v8f16_extract_01:
476 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
477 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
478 ; GFX11-NEXT: s_mov_b32 s2, -1
479 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
480 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
481 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
482 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16
483 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
484 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
485 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
486 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
487 ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 dlc
488 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
489 ; GFX11-NEXT: s_nop 0
490 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
491 ; GFX11-NEXT: s_endpgm
492 %load = load <16 x half>, ptr addrspace(4) %ptr
493 %elt0 = extractelement <16 x half> %load, i32 0
494 %elt1 = extractelement <16 x half> %load, i32 1
495 store volatile half %elt0, ptr addrspace(1) undef, align 2
496 store volatile half %elt1, ptr addrspace(1) undef, align 2
; Loads a <16 x half> through %ptr but uses only elements 2 and 3, each
; written with a volatile store through an undef addrspace(1) pointer.
; The expected output loads a single dword (the immediate is 0x1 in the first
; check block and 0x4 in the other two) instead of the whole vector, and
; takes element 3 from it with a 16-bit right shift.
; NOTE(review): the name says v8f16 but the IR below operates on
; <16 x half>. The name is left as is because the LABEL check lines match
; on it.
500 define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) %ptr) #0 {
501 ; SI-LABEL: reduce_load_vector_v8f16_extract_23:
503 ; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
504 ; SI-NEXT: s_waitcnt lgkmcnt(0)
505 ; SI-NEXT: s_load_dword s0, s[0:1], 0x1
506 ; SI-NEXT: s_mov_b32 s3, 0xf000
507 ; SI-NEXT: s_mov_b32 s2, -1
508 ; SI-NEXT: s_waitcnt lgkmcnt(0)
509 ; SI-NEXT: s_lshr_b32 s1, s0, 16
510 ; SI-NEXT: v_mov_b32_e32 v0, s0
511 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
512 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
513 ; SI-NEXT: v_mov_b32_e32 v0, s1
514 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
515 ; SI-NEXT: s_waitcnt vmcnt(0)
518 ; VI-LABEL: reduce_load_vector_v8f16_extract_23:
520 ; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
521 ; VI-NEXT: s_mov_b32 s3, 0xf000
522 ; VI-NEXT: s_mov_b32 s2, -1
523 ; VI-NEXT: s_waitcnt lgkmcnt(0)
524 ; VI-NEXT: s_load_dword s0, s[0:1], 0x4
525 ; VI-NEXT: s_waitcnt lgkmcnt(0)
526 ; VI-NEXT: s_lshr_b32 s1, s0, 16
527 ; VI-NEXT: v_mov_b32_e32 v0, s0
528 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
529 ; VI-NEXT: s_waitcnt vmcnt(0)
530 ; VI-NEXT: v_mov_b32_e32 v0, s1
531 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
532 ; VI-NEXT: s_waitcnt vmcnt(0)
535 ; GFX11-LABEL: reduce_load_vector_v8f16_extract_23:
537 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
538 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
539 ; GFX11-NEXT: s_mov_b32 s2, -1
540 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
541 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x4
542 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
543 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16
544 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
545 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
546 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
547 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
548 ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 dlc
549 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
550 ; GFX11-NEXT: s_nop 0
551 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
552 ; GFX11-NEXT: s_endpgm
553 %load = load <16 x half>, ptr addrspace(4) %ptr
554 %elt2 = extractelement <16 x half> %load, i32 2
555 %elt3 = extractelement <16 x half> %load, i32 3
556 store volatile half %elt2, ptr addrspace(1) undef, align 2
557 store volatile half %elt3, ptr addrspace(1) undef, align 2
; Variable-index extract from an <8 x half> that each work item loads from
; %in at a position selected by its work-item id. The index %n is a kernel
; argument. The extracted half is stored to %out at the same work-item
; position.
; The expected output loads the full 16-byte vector and selects the element
; with a chain of s_cmp_eq_u32 / v_cndmask_b32 pairs that compares the index
; against 1 through 7. The first check block also converts every element to
; f32 before the select chain and converts the result back to f16.
561 define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 {
562 ; SI-LABEL: v_extractelement_v8f16_dynamic_sgpr:
564 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
565 ; SI-NEXT: s_load_dword s8, s[2:3], 0xd
566 ; SI-NEXT: s_mov_b32 s3, 0xf000
567 ; SI-NEXT: s_mov_b32 s2, 0
568 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
569 ; SI-NEXT: v_mov_b32_e32 v5, 0
570 ; SI-NEXT: s_waitcnt lgkmcnt(0)
571 ; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
572 ; SI-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64
573 ; SI-NEXT: v_lshlrev_b32_e32 v6, 1, v0
574 ; SI-NEXT: v_mov_b32_e32 v7, v5
575 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
576 ; SI-NEXT: s_cmp_eq_u32 s8, 1
577 ; SI-NEXT: s_waitcnt vmcnt(0)
578 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
579 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
580 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v2
581 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
582 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v3
583 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
584 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v4
585 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
586 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
587 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
588 ; SI-NEXT: s_cmp_eq_u32 s8, 2
589 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
590 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
591 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
592 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
593 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
594 ; SI-NEXT: s_cmp_eq_u32 s8, 3
595 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
596 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
597 ; SI-NEXT: s_cmp_eq_u32 s8, 4
598 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
599 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
600 ; SI-NEXT: s_cmp_eq_u32 s8, 5
601 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
602 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
603 ; SI-NEXT: s_cmp_eq_u32 s8, 6
604 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
605 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
606 ; SI-NEXT: s_cmp_eq_u32 s8, 7
607 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
608 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
609 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
610 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
611 ; SI-NEXT: buffer_store_short v0, v[6:7], s[4:7], 0 addr64
614 ; VI-LABEL: v_extractelement_v8f16_dynamic_sgpr:
616 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
617 ; VI-NEXT: s_load_dword s0, s[2:3], 0x34
618 ; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v0
619 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
620 ; VI-NEXT: s_waitcnt lgkmcnt(0)
621 ; VI-NEXT: v_mov_b32_e32 v2, s7
622 ; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
623 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
624 ; VI-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
625 ; VI-NEXT: v_mov_b32_e32 v6, s5
626 ; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v0
627 ; VI-NEXT: s_cmp_eq_u32 s0, 1
628 ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
629 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
630 ; VI-NEXT: s_cmp_eq_u32 s0, 2
631 ; VI-NEXT: s_waitcnt vmcnt(0)
632 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
633 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
634 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
635 ; VI-NEXT: s_cmp_eq_u32 s0, 3
636 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v2
637 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
638 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
639 ; VI-NEXT: s_cmp_eq_u32 s0, 4
640 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
641 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
642 ; VI-NEXT: s_cmp_eq_u32 s0, 5
643 ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
644 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
645 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
646 ; VI-NEXT: s_cmp_eq_u32 s0, 6
647 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
648 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
649 ; VI-NEXT: s_cmp_eq_u32 s0, 7
650 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4
651 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
652 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
653 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
654 ; VI-NEXT: flat_store_short v[5:6], v0
657 ; GFX11-LABEL: v_extractelement_v8f16_dynamic_sgpr:
659 ; GFX11-NEXT: s_clause 0x1
660 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
661 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34
662 ; GFX11-NEXT: v_and_b32_e32 v4, 0x3ff, v0
663 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
664 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v4
665 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
666 ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[6:7]
667 ; GFX11-NEXT: s_cmp_eq_u32 s0, 1
668 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
669 ; GFX11-NEXT: s_cmp_eq_u32 s0, 2
670 ; GFX11-NEXT: s_waitcnt vmcnt(0)
671 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
672 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
673 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
674 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
675 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
676 ; GFX11-NEXT: s_cmp_eq_u32 s0, 3
677 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
678 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
679 ; GFX11-NEXT: s_cmp_eq_u32 s0, 4
680 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
681 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
682 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
683 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
684 ; GFX11-NEXT: s_cmp_eq_u32 s0, 5
685 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
686 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
687 ; GFX11-NEXT: s_cmp_eq_u32 s0, 6
688 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v4
689 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
690 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
691 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
692 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3
693 ; GFX11-NEXT: s_cmp_eq_u32 s0, 7
694 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
695 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
696 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
697 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
698 ; GFX11-NEXT: global_store_b16 v2, v0, s[4:5]
699 ; GFX11-NEXT: s_nop 0
700 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
701 ; GFX11-NEXT: s_endpgm
702 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
703 %tid.ext = sext i32 %tid to i64
704 %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
705 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
706 %vec = load <8 x half>, ptr addrspace(1) %in.gep
707 %vec.extract = extractelement <8 x half> %vec, i32 %n
708 store half %vec.extract, ptr addrspace(1) %out.gep
712 define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 {
713 ; SI-LABEL: v_extractelement_v16f16_dynamic_sgpr:
715 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
716 ; SI-NEXT: s_load_dword s8, s[2:3], 0xd
717 ; SI-NEXT: s_mov_b32 s3, 0xf000
718 ; SI-NEXT: s_mov_b32 s2, 0
719 ; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0
720 ; SI-NEXT: v_mov_b32_e32 v6, 0
721 ; SI-NEXT: s_waitcnt lgkmcnt(0)
722 ; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
723 ; SI-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[0:3], 0 addr64
724 ; SI-NEXT: v_lshlrev_b32_e32 v9, 1, v0
725 ; SI-NEXT: v_mov_b32_e32 v10, v6
726 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
727 ; SI-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[0:3], 0 addr64 offset:16
728 ; SI-NEXT: s_cmp_eq_u32 s8, 1
729 ; SI-NEXT: s_waitcnt vmcnt(1)
730 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
731 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
732 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v2
733 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
734 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v3
735 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
736 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v4
737 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
738 ; SI-NEXT: s_waitcnt vmcnt(0)
739 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v5
740 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
741 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v6
742 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
743 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v7
744 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
745 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v8
746 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
747 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
748 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
749 ; SI-NEXT: s_cmp_eq_u32 s8, 2
750 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
751 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
752 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
753 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
754 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
755 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
756 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
757 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
758 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
759 ; SI-NEXT: s_cmp_eq_u32 s8, 3
760 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
761 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
762 ; SI-NEXT: s_cmp_eq_u32 s8, 4
763 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
764 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
765 ; SI-NEXT: s_cmp_eq_u32 s8, 5
766 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
767 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
768 ; SI-NEXT: s_cmp_eq_u32 s8, 6
769 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
770 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
771 ; SI-NEXT: s_cmp_eq_u32 s8, 7
772 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
773 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
774 ; SI-NEXT: s_cmp_eq_u32 s8, 8
775 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
776 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
777 ; SI-NEXT: s_cmp_eq_u32 s8, 9
778 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
779 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
780 ; SI-NEXT: s_cmp_eq_u32 s8, 10
781 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
782 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
783 ; SI-NEXT: s_cmp_eq_u32 s8, 11
784 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc
785 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
786 ; SI-NEXT: s_cmp_eq_u32 s8, 12
787 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
788 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
789 ; SI-NEXT: s_cmp_eq_u32 s8, 13
790 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
791 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
792 ; SI-NEXT: s_cmp_eq_u32 s8, 14
793 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
794 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
795 ; SI-NEXT: s_cmp_eq_u32 s8, 15
796 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc
797 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
798 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
799 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
800 ; SI-NEXT: buffer_store_short v0, v[9:10], s[4:7], 0 addr64
803 ; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr:
805 ; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
806 ; VI-NEXT: s_load_dword s0, s[2:3], 0x34
807 ; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v0
808 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
809 ; VI-NEXT: s_waitcnt lgkmcnt(0)
810 ; VI-NEXT: v_mov_b32_e32 v2, s7
811 ; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v1
812 ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
813 ; VI-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
814 ; VI-NEXT: v_add_u32_e32 v5, vcc, 16, v5
815 ; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
816 ; VI-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
817 ; VI-NEXT: v_mov_b32_e32 v10, s5
818 ; VI-NEXT: v_add_u32_e32 v9, vcc, s4, v0
819 ; VI-NEXT: s_cmp_eq_u32 s0, 1
820 ; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
821 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
822 ; VI-NEXT: s_cmp_eq_u32 s0, 2
823 ; VI-NEXT: s_waitcnt vmcnt(1)
824 ; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
825 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
826 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
827 ; VI-NEXT: s_cmp_eq_u32 s0, 3
828 ; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
829 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
830 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
831 ; VI-NEXT: s_cmp_eq_u32 s0, 4
832 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
833 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
834 ; VI-NEXT: s_cmp_eq_u32 s0, 5
835 ; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v3
836 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
837 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
838 ; VI-NEXT: s_cmp_eq_u32 s0, 6
839 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
840 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
841 ; VI-NEXT: s_cmp_eq_u32 s0, 7
842 ; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4
843 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
844 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
845 ; VI-NEXT: s_cmp_eq_u32 s0, 8
846 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
847 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
848 ; VI-NEXT: s_cmp_eq_u32 s0, 9
849 ; VI-NEXT: s_waitcnt vmcnt(0)
850 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
851 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
852 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
853 ; VI-NEXT: s_cmp_eq_u32 s0, 10
854 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
855 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
856 ; VI-NEXT: s_cmp_eq_u32 s0, 11
857 ; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v6
858 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
859 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
860 ; VI-NEXT: s_cmp_eq_u32 s0, 12
861 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
862 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
863 ; VI-NEXT: s_cmp_eq_u32 s0, 13
864 ; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7
865 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
866 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
867 ; VI-NEXT: s_cmp_eq_u32 s0, 14
868 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc
869 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
870 ; VI-NEXT: s_cmp_eq_u32 s0, 15
871 ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v8
872 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
873 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
874 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc
875 ; VI-NEXT: flat_store_short v[9:10], v0
878 ; GFX11-LABEL: v_extractelement_v16f16_dynamic_sgpr:
880 ; GFX11-NEXT: s_clause 0x1
881 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
882 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x34
883 ; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0
884 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
885 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8
886 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
887 ; GFX11-NEXT: s_clause 0x1
888 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7]
889 ; GFX11-NEXT: global_load_b128 v[4:7], v4, s[6:7] offset:16
890 ; GFX11-NEXT: s_cmp_eq_u32 s0, 1
891 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
892 ; GFX11-NEXT: s_cmp_eq_u32 s0, 2
893 ; GFX11-NEXT: s_waitcnt vmcnt(1)
894 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0
895 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
896 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
897 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
898 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1
899 ; GFX11-NEXT: s_cmp_eq_u32 s0, 3
900 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
901 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
902 ; GFX11-NEXT: s_cmp_eq_u32 s0, 4
903 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
904 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
905 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
906 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
907 ; GFX11-NEXT: s_cmp_eq_u32 s0, 5
908 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
909 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
910 ; GFX11-NEXT: s_cmp_eq_u32 s0, 6
911 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v8
912 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
913 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
914 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
915 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3
916 ; GFX11-NEXT: s_cmp_eq_u32 s0, 7
917 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
918 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
919 ; GFX11-NEXT: s_cmp_eq_u32 s0, 8
920 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
921 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
922 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
923 ; GFX11-NEXT: s_waitcnt vmcnt(0)
924 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4
925 ; GFX11-NEXT: s_cmp_eq_u32 s0, 9
926 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
927 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
928 ; GFX11-NEXT: s_cmp_eq_u32 s0, 10
929 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
930 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
931 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
932 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5
933 ; GFX11-NEXT: s_cmp_eq_u32 s0, 11
934 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
935 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
936 ; GFX11-NEXT: s_cmp_eq_u32 s0, 12
937 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
938 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
939 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
940 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6
941 ; GFX11-NEXT: s_cmp_eq_u32 s0, 13
942 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
943 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
944 ; GFX11-NEXT: s_cmp_eq_u32 s0, 14
945 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
946 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
947 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
948 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7
949 ; GFX11-NEXT: s_cmp_eq_u32 s0, 15
950 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
951 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
952 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
953 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
954 ; GFX11-NEXT: global_store_b16 v2, v0, s[4:5]
955 ; GFX11-NEXT: s_nop 0
956 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
957 ; GFX11-NEXT: s_endpgm
958 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
959 %tid.ext = sext i32 %tid to i64
960 %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
961 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
962 %vec = load <16 x half>, ptr addrspace(1) %in.gep
963 %vec.extract = extractelement <16 x half> %vec, i32 %n
964 store half %vec.extract, ptr addrspace(1) %out.gep
; Declaration of the amdgcn work-item-id intrinsic (x component, returns i32).
; The kernel above calls it to obtain %tid, which is sign-extended and used to
; index both the %in and %out GEPs.
968 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 (nounwind): referenced by kernel definitions such as
; @extract_vector_elt_v2f16.
970 attributes #0 = { nounwind }
; Attribute group #1 (nounwind readnone): referenced by the workitem.id.x
; declaration and by its call site.
971 attributes #1 = { nounwind readnone }