1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=CIVI,CI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=CIVI,VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11 %s
6 ; half args should be promoted to float for CI and lower.
; Stores the `half` kernel argument %arg to %out unchanged.
; The checks expect a scalar dword load of the argument followed by a 16-bit
; store (flat_store_short on CI/VI, global_store_b16 on GFX11).
; NOTE(review): the argument offset is 0x2 in the CI checks but 0x8 in the
; VI/GFX11 checks - presumably dword vs. byte offset encoding; confirm.
8 define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
9 ; CI-LABEL: load_f16_arg:
11 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
12 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2
13 ; CI-NEXT: s_waitcnt lgkmcnt(0)
14 ; CI-NEXT: v_mov_b32_e32 v0, s0
15 ; CI-NEXT: v_mov_b32_e32 v1, s1
16 ; CI-NEXT: v_mov_b32_e32 v2, s2
17 ; CI-NEXT: flat_store_short v[0:1], v2
20 ; VI-LABEL: load_f16_arg:
22 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
23 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
24 ; VI-NEXT: s_waitcnt lgkmcnt(0)
25 ; VI-NEXT: v_mov_b32_e32 v0, s0
26 ; VI-NEXT: v_mov_b32_e32 v1, s1
27 ; VI-NEXT: v_mov_b32_e32 v2, s2
28 ; VI-NEXT: flat_store_short v[0:1], v2
31 ; GFX11-LABEL: load_f16_arg:
33 ; GFX11-NEXT: s_clause 0x1
34 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
35 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
36 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
37 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
38 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
40 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
41 ; GFX11-NEXT: s_endpgm
42 store half %arg, ptr addrspace(1) %out
; Stores the <2 x half> kernel argument %arg to %out unchanged.
; The checks expect one scalar dword load of the argument and a single 32-bit
; store (flat_store_dword on CI/VI, global_store_b32 on GFX11).
46 define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 {
47 ; CI-LABEL: load_v2f16_arg:
49 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
50 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2
51 ; CI-NEXT: s_waitcnt lgkmcnt(0)
52 ; CI-NEXT: v_mov_b32_e32 v0, s0
53 ; CI-NEXT: v_mov_b32_e32 v1, s1
54 ; CI-NEXT: v_mov_b32_e32 v2, s2
55 ; CI-NEXT: flat_store_dword v[0:1], v2
58 ; VI-LABEL: load_v2f16_arg:
60 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
61 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
62 ; VI-NEXT: s_waitcnt lgkmcnt(0)
63 ; VI-NEXT: v_mov_b32_e32 v0, s0
64 ; VI-NEXT: v_mov_b32_e32 v1, s1
65 ; VI-NEXT: v_mov_b32_e32 v2, s2
66 ; VI-NEXT: flat_store_dword v[0:1], v2
69 ; GFX11-LABEL: load_v2f16_arg:
71 ; GFX11-NEXT: s_clause 0x1
72 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
73 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
74 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
75 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
76 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
78 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
79 ; GFX11-NEXT: s_endpgm
80 store <2 x half> %arg, ptr addrspace(1) %out
; Stores the <3 x half> kernel argument %arg to %out unchanged.
; The checks expect a single 128-bit scalar load covering both kernel
; arguments, then the value written as two stores: a 32-bit store at the base
; address and a 16-bit store at byte offset 4.
84 define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 {
85 ; CIVI-LABEL: load_v3f16_arg:
87 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
88 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
89 ; CIVI-NEXT: s_add_u32 s4, s0, 4
90 ; CIVI-NEXT: s_addc_u32 s5, s1, 0
91 ; CIVI-NEXT: v_mov_b32_e32 v2, s4
92 ; CIVI-NEXT: v_mov_b32_e32 v4, s3
93 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
94 ; CIVI-NEXT: v_mov_b32_e32 v3, s5
95 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
96 ; CIVI-NEXT: v_mov_b32_e32 v5, s2
97 ; CIVI-NEXT: flat_store_short v[2:3], v4
98 ; CIVI-NEXT: flat_store_dword v[0:1], v5
101 ; GFX11-LABEL: load_v3f16_arg:
103 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
104 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
105 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
106 ; GFX11-NEXT: v_mov_b32_e32 v2, s2
107 ; GFX11-NEXT: s_clause 0x1
108 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:4
109 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
110 ; GFX11-NEXT: s_nop 0
111 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
112 ; GFX11-NEXT: s_endpgm
113 store <3 x half> %arg, ptr addrspace(1) %out
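118 ; FIXME: Why not one load? NOTE(review): the CIVI and GFX11 checks below already expect a single s_load_dwordx4 / s_load_b128, so this FIXME may be stale - confirm before removing.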
; Stores the <4 x half> kernel argument %arg to %out unchanged.
; The checks expect one 128-bit scalar load covering both kernel arguments and
; a single 64-bit store (flat_store_dwordx2 on CI/VI, global_store_b64 on GFX11).
119 define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 {
120 ; CIVI-LABEL: load_v4f16_arg:
122 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
123 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
124 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
125 ; CIVI-NEXT: v_mov_b32_e32 v2, s2
126 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
127 ; CIVI-NEXT: v_mov_b32_e32 v3, s3
128 ; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
129 ; CIVI-NEXT: s_endpgm
131 ; GFX11-LABEL: load_v4f16_arg:
133 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
134 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
135 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
136 ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
137 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
138 ; GFX11-NEXT: s_nop 0
139 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
140 ; GFX11-NEXT: s_endpgm
141 store <4 x half> %arg, ptr addrspace(1) %out
; Stores the <8 x half> kernel argument %arg to %out unchanged.
; The checks expect a 64-bit scalar load of the pointer, a 128-bit scalar load
; of the vector argument, and a single 128-bit store.
145 define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 {
146 ; CI-LABEL: load_v8f16_arg:
148 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
149 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4
150 ; CI-NEXT: s_waitcnt lgkmcnt(0)
151 ; CI-NEXT: v_mov_b32_e32 v4, s4
152 ; CI-NEXT: v_mov_b32_e32 v0, s0
153 ; CI-NEXT: v_mov_b32_e32 v5, s5
154 ; CI-NEXT: v_mov_b32_e32 v1, s1
155 ; CI-NEXT: v_mov_b32_e32 v2, s2
156 ; CI-NEXT: v_mov_b32_e32 v3, s3
157 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
160 ; VI-LABEL: load_v8f16_arg:
162 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
163 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10
164 ; VI-NEXT: s_waitcnt lgkmcnt(0)
165 ; VI-NEXT: v_mov_b32_e32 v4, s4
166 ; VI-NEXT: v_mov_b32_e32 v0, s0
167 ; VI-NEXT: v_mov_b32_e32 v5, s5
168 ; VI-NEXT: v_mov_b32_e32 v1, s1
169 ; VI-NEXT: v_mov_b32_e32 v2, s2
170 ; VI-NEXT: v_mov_b32_e32 v3, s3
171 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
174 ; GFX11-LABEL: load_v8f16_arg:
176 ; GFX11-NEXT: s_clause 0x1
177 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10
178 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
179 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
180 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
181 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
182 ; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
183 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
184 ; GFX11-NEXT: s_nop 0
185 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
186 ; GFX11-NEXT: s_endpgm
187 store <8 x half> %arg, ptr addrspace(1) %out
; Extends the <2 x half> kernel argument %in to <2 x float> and stores it to %out.
; The checks expect the upper half to be extracted with a 16-bit scalar right
; shift, one v_cvt_f32_f16 per element, and a single 64-bit store.
191 define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %in) #0 {
192 ; CI-LABEL: extload_v2f16_arg:
194 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2
195 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
196 ; CI-NEXT: s_waitcnt lgkmcnt(0)
197 ; CI-NEXT: s_lshr_b32 s3, s2, 16
198 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
199 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
200 ; CI-NEXT: v_mov_b32_e32 v3, s1
201 ; CI-NEXT: v_mov_b32_e32 v2, s0
202 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
205 ; VI-LABEL: extload_v2f16_arg:
207 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
208 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
209 ; VI-NEXT: s_waitcnt lgkmcnt(0)
210 ; VI-NEXT: s_lshr_b32 s3, s2, 16
211 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
212 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
213 ; VI-NEXT: v_mov_b32_e32 v3, s1
214 ; VI-NEXT: v_mov_b32_e32 v2, s0
215 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
218 ; GFX11-LABEL: extload_v2f16_arg:
220 ; GFX11-NEXT: s_clause 0x1
221 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
222 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
223 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
224 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
225 ; GFX11-NEXT: s_lshr_b32 s2, s4, 16
226 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4
227 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2
228 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
229 ; GFX11-NEXT: s_nop 0
230 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
231 ; GFX11-NEXT: s_endpgm
232 %fpext = fpext <2 x half> %in to <2 x float>
233 store <2 x float> %fpext, ptr addrspace(1) %out
; Extends the scalar `half` kernel argument %arg to float and stores it to %out.
; The checks expect a single v_cvt_f32_f16 reading the loaded SGPR directly,
; followed by a 32-bit store.
237 define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %arg) #0 {
238 ; CI-LABEL: extload_f16_to_f32_arg:
240 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2
241 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
242 ; CI-NEXT: s_waitcnt lgkmcnt(0)
243 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s2
244 ; CI-NEXT: v_mov_b32_e32 v0, s0
245 ; CI-NEXT: v_mov_b32_e32 v1, s1
246 ; CI-NEXT: flat_store_dword v[0:1], v2
249 ; VI-LABEL: extload_f16_to_f32_arg:
251 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
252 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
253 ; VI-NEXT: s_waitcnt lgkmcnt(0)
254 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s2
255 ; VI-NEXT: v_mov_b32_e32 v0, s0
256 ; VI-NEXT: v_mov_b32_e32 v1, s1
257 ; VI-NEXT: flat_store_dword v[0:1], v2
260 ; GFX11-LABEL: extload_f16_to_f32_arg:
262 ; GFX11-NEXT: s_clause 0x1
263 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
264 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
265 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
266 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
267 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4
268 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
269 ; GFX11-NEXT: s_nop 0
270 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
271 ; GFX11-NEXT: s_endpgm
272 %ext = fpext half %arg to float
273 store float %ext, ptr addrspace(1) %out
; Extends the <2 x half> kernel argument %arg to <2 x float> and stores it to %out.
; The checks expect a 16-bit scalar right shift to isolate the upper element,
; two v_cvt_f32_f16 conversions, and a single 64-bit store.
277 define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 {
278 ; CI-LABEL: extload_v2f16_to_v2f32_arg:
280 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2
281 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
282 ; CI-NEXT: s_waitcnt lgkmcnt(0)
283 ; CI-NEXT: s_lshr_b32 s3, s2, 16
284 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
285 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
286 ; CI-NEXT: v_mov_b32_e32 v3, s1
287 ; CI-NEXT: v_mov_b32_e32 v2, s0
288 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
291 ; VI-LABEL: extload_v2f16_to_v2f32_arg:
293 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
294 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
295 ; VI-NEXT: s_waitcnt lgkmcnt(0)
296 ; VI-NEXT: s_lshr_b32 s3, s2, 16
297 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
298 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
299 ; VI-NEXT: v_mov_b32_e32 v3, s1
300 ; VI-NEXT: v_mov_b32_e32 v2, s0
301 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
304 ; GFX11-LABEL: extload_v2f16_to_v2f32_arg:
306 ; GFX11-NEXT: s_clause 0x1
307 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
308 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
309 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
310 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
311 ; GFX11-NEXT: s_lshr_b32 s2, s4, 16
312 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4
313 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2
314 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
315 ; GFX11-NEXT: s_nop 0
316 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
317 ; GFX11-NEXT: s_endpgm
318 %ext = fpext <2 x half> %arg to <2 x float>
319 store <2 x float> %ext, ptr addrspace(1) %out
; Extends the <3 x half> kernel argument %arg to <3 x float> and stores it to %out.
; The checks expect one 128-bit scalar load covering both kernel arguments,
; three v_cvt_f32_f16 conversions, and a single 96-bit store
; (flat_store_dwordx3 on CI/VI, global_store_b96 on GFX11).
323 define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 {
324 ; CI-LABEL: extload_v3f16_to_v3f32_arg:
326 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
327 ; CI-NEXT: s_waitcnt lgkmcnt(0)
328 ; CI-NEXT: s_lshr_b32 s4, s2, 16
329 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
330 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
331 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
332 ; CI-NEXT: v_mov_b32_e32 v4, s1
333 ; CI-NEXT: v_mov_b32_e32 v3, s0
334 ; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
337 ; VI-LABEL: extload_v3f16_to_v3f32_arg:
339 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
340 ; VI-NEXT: s_waitcnt lgkmcnt(0)
341 ; VI-NEXT: s_lshr_b32 s4, s2, 16
342 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
343 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s4
344 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
345 ; VI-NEXT: v_mov_b32_e32 v4, s1
346 ; VI-NEXT: v_mov_b32_e32 v3, s0
347 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
350 ; GFX11-LABEL: extload_v3f16_to_v3f32_arg:
352 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
353 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
354 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
355 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16
356 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
357 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4
358 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
359 ; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1]
360 ; GFX11-NEXT: s_nop 0
361 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
362 ; GFX11-NEXT: s_endpgm
363 %ext = fpext <3 x half> %arg to <3 x float>
364 store <3 x float> %ext, ptr addrspace(1) %out
; Extends the <4 x half> kernel argument %arg to <4 x float> and stores it to %out.
; The checks expect one 128-bit scalar load covering both kernel arguments,
; two 16-bit scalar shifts for the upper halves, four v_cvt_f32_f16
; conversions, and a single 128-bit store.
368 define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 {
369 ; CI-LABEL: extload_v4f16_to_v4f32_arg:
371 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
372 ; CI-NEXT: s_waitcnt lgkmcnt(0)
373 ; CI-NEXT: s_lshr_b32 s4, s3, 16
374 ; CI-NEXT: s_lshr_b32 s5, s2, 16
375 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
376 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s4
377 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s5
378 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
379 ; CI-NEXT: v_mov_b32_e32 v5, s1
380 ; CI-NEXT: v_mov_b32_e32 v4, s0
381 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
384 ; VI-LABEL: extload_v4f16_to_v4f32_arg:
386 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
387 ; VI-NEXT: s_waitcnt lgkmcnt(0)
388 ; VI-NEXT: s_lshr_b32 s4, s3, 16
389 ; VI-NEXT: s_lshr_b32 s5, s2, 16
390 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
391 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s4
392 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s5
393 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
394 ; VI-NEXT: v_mov_b32_e32 v5, s1
395 ; VI-NEXT: v_mov_b32_e32 v4, s0
396 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
399 ; GFX11-LABEL: extload_v4f16_to_v4f32_arg:
401 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
402 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
403 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
404 ; GFX11-NEXT: s_lshr_b32 s4, s3, 16
405 ; GFX11-NEXT: s_lshr_b32 s5, s2, 16
406 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
407 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s4
408 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s5
409 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
410 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
411 ; GFX11-NEXT: s_nop 0
412 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
413 ; GFX11-NEXT: s_endpgm
414 %ext = fpext <4 x half> %arg to <4 x float>
415 store <4 x float> %ext, ptr addrspace(1) %out
; Extends the <8 x half> kernel argument %arg to <8 x float> and stores it to %out.
; The checks expect eight v_cvt_f32_f16 conversions and the 32-byte result
; written as two 128-bit stores: one at byte offset 16 and one at the base
; address.
419 define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 {
420 ; CI-LABEL: extload_v8f16_to_v8f32_arg:
422 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4
423 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
424 ; CI-NEXT: s_waitcnt lgkmcnt(0)
425 ; CI-NEXT: s_lshr_b32 s6, s1, 16
426 ; CI-NEXT: s_lshr_b32 s7, s0, 16
427 ; CI-NEXT: s_lshr_b32 s8, s3, 16
428 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s6
429 ; CI-NEXT: s_lshr_b32 s6, s2, 16
430 ; CI-NEXT: v_cvt_f32_f16_e32 v7, s8
431 ; CI-NEXT: v_cvt_f32_f16_e32 v5, s6
432 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
433 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s3
434 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
435 ; CI-NEXT: s_add_u32 s0, s4, 16
436 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s1
437 ; CI-NEXT: s_addc_u32 s1, s5, 0
438 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s7
439 ; CI-NEXT: v_mov_b32_e32 v9, s1
440 ; CI-NEXT: v_mov_b32_e32 v8, s0
441 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
443 ; CI-NEXT: v_mov_b32_e32 v4, s4
444 ; CI-NEXT: v_mov_b32_e32 v5, s5
445 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
448 ; VI-LABEL: extload_v8f16_to_v8f32_arg:
450 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10
451 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
452 ; VI-NEXT: s_waitcnt lgkmcnt(0)
453 ; VI-NEXT: s_lshr_b32 s6, s1, 16
454 ; VI-NEXT: s_lshr_b32 s7, s0, 16
455 ; VI-NEXT: s_lshr_b32 s8, s3, 16
456 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s6
457 ; VI-NEXT: s_lshr_b32 s6, s2, 16
458 ; VI-NEXT: v_cvt_f32_f16_e32 v7, s8
459 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s6
460 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
461 ; VI-NEXT: v_cvt_f32_f16_e32 v6, s3
462 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s2
463 ; VI-NEXT: s_add_u32 s0, s4, 16
464 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
465 ; VI-NEXT: s_addc_u32 s1, s5, 0
466 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s7
467 ; VI-NEXT: v_mov_b32_e32 v9, s1
468 ; VI-NEXT: v_mov_b32_e32 v8, s0
469 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
471 ; VI-NEXT: v_mov_b32_e32 v4, s4
472 ; VI-NEXT: v_mov_b32_e32 v5, s5
473 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
476 ; GFX11-LABEL: extload_v8f16_to_v8f32_arg:
478 ; GFX11-NEXT: s_clause 0x1
479 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10
480 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
481 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
482 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
483 ; GFX11-NEXT: s_lshr_b32 s8, s7, 16
484 ; GFX11-NEXT: s_lshr_b32 s9, s6, 16
485 ; GFX11-NEXT: s_lshr_b32 s2, s5, 16
486 ; GFX11-NEXT: s_lshr_b32 s3, s4, 16
487 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s7
488 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, s6
489 ; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s8
490 ; GFX11-NEXT: v_cvt_f32_f16_e32 v5, s9
491 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s5
492 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4
493 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s2
494 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3
495 ; GFX11-NEXT: s_clause 0x1
496 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
497 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
498 ; GFX11-NEXT: s_nop 0
499 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
500 ; GFX11-NEXT: s_endpgm
501 %ext = fpext <8 x half> %arg to <8 x float>
502 store <8 x float> %ext, ptr addrspace(1) %out
; Extends the scalar `half` kernel argument %arg to double and stores it to %out.
; The checks expect a two-step conversion through f32 (v_cvt_f32_f16 followed
; by v_cvt_f64_f32) and a single 64-bit store. The pointer load is expected
; after the first conversion, reusing the SGPR that held the argument.
506 define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %arg) #0 {
507 ; CI-LABEL: extload_f16_to_f64_arg:
509 ; CI-NEXT: s_load_dword s0, s[6:7], 0x2
510 ; CI-NEXT: s_waitcnt lgkmcnt(0)
511 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
512 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
513 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
514 ; CI-NEXT: s_waitcnt lgkmcnt(0)
515 ; CI-NEXT: v_mov_b32_e32 v3, s1
516 ; CI-NEXT: v_mov_b32_e32 v2, s0
517 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
520 ; VI-LABEL: extload_f16_to_f64_arg:
522 ; VI-NEXT: s_load_dword s0, s[6:7], 0x8
523 ; VI-NEXT: s_waitcnt lgkmcnt(0)
524 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
525 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
526 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
527 ; VI-NEXT: s_waitcnt lgkmcnt(0)
528 ; VI-NEXT: v_mov_b32_e32 v3, s1
529 ; VI-NEXT: v_mov_b32_e32 v2, s0
530 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
533 ; GFX11-LABEL: extload_f16_to_f64_arg:
535 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8
536 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
537 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
538 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0
539 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
540 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
541 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
542 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
543 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
544 ; GFX11-NEXT: s_nop 0
545 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
546 ; GFX11-NEXT: s_endpgm
547 %ext = fpext half %arg to double
548 store double %ext, ptr addrspace(1) %out
; Extends the <2 x half> kernel argument %arg to <2 x double> and stores it to %out.
; The checks expect each element to be converted through f32 (v_cvt_f32_f16
; then v_cvt_f64_f32) and the 16-byte result written with a single 128-bit
; store.
552 define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 {
553 ; CI-LABEL: extload_v2f16_to_v2f64_arg:
555 ; CI-NEXT: s_load_dword s0, s[6:7], 0x2
556 ; CI-NEXT: s_waitcnt lgkmcnt(0)
557 ; CI-NEXT: s_lshr_b32 s1, s0, 16
558 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s1
559 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
560 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
561 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
562 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
563 ; CI-NEXT: s_waitcnt lgkmcnt(0)
564 ; CI-NEXT: v_mov_b32_e32 v5, s1
565 ; CI-NEXT: v_mov_b32_e32 v4, s0
566 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
569 ; VI-LABEL: extload_v2f16_to_v2f64_arg:
571 ; VI-NEXT: s_load_dword s0, s[6:7], 0x8
572 ; VI-NEXT: s_waitcnt lgkmcnt(0)
573 ; VI-NEXT: s_lshr_b32 s1, s0, 16
574 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s1
575 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s0
576 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
577 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
578 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
579 ; VI-NEXT: s_waitcnt lgkmcnt(0)
580 ; VI-NEXT: v_mov_b32_e32 v5, s1
581 ; VI-NEXT: v_mov_b32_e32 v4, s0
582 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
585 ; GFX11-LABEL: extload_v2f16_to_v2f64_arg:
587 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x8
588 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
589 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
590 ; GFX11-NEXT: s_lshr_b32 s1, s0, 16
591 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0
592 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1
593 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
594 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
595 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
596 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
597 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
598 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
599 ; GFX11-NEXT: s_nop 0
600 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
601 ; GFX11-NEXT: s_endpgm
602 %ext = fpext <2 x half> %arg to <2 x double>
603 store <2 x double> %ext, ptr addrspace(1) %out
; Extends the <3 x half> kernel argument %arg to <3 x double> and stores it to %out.
; The checks expect each element to be converted through f32 (v_cvt_f32_f16
; then v_cvt_f64_f32) and the result written as two stores: a 64-bit store at
; byte offset 16 and a 128-bit store at the base address.
607 define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 {
608 ; CI-LABEL: extload_v3f16_to_v3f64_arg:
610 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
611 ; CI-NEXT: s_waitcnt lgkmcnt(0)
612 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
613 ; CI-NEXT: s_lshr_b32 s4, s2, 16
614 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s2
615 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
616 ; CI-NEXT: s_add_u32 s2, s0, 16
617 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0
618 ; CI-NEXT: s_addc_u32 s3, s1, 0
619 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
620 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
621 ; CI-NEXT: v_mov_b32_e32 v7, s3
622 ; CI-NEXT: v_mov_b32_e32 v6, s2
623 ; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
624 ; CI-NEXT: v_mov_b32_e32 v5, s1
625 ; CI-NEXT: v_mov_b32_e32 v4, s0
626 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
629 ; VI-LABEL: extload_v3f16_to_v3f64_arg:
631 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
632 ; VI-NEXT: s_waitcnt lgkmcnt(0)
633 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
634 ; VI-NEXT: s_lshr_b32 s4, s2, 16
635 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
636 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s4
637 ; VI-NEXT: s_add_u32 s2, s0, 16
638 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1
639 ; VI-NEXT: s_addc_u32 s3, s1, 0
640 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
641 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
642 ; VI-NEXT: v_mov_b32_e32 v7, s3
643 ; VI-NEXT: v_mov_b32_e32 v6, s2
644 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
645 ; VI-NEXT: v_mov_b32_e32 v5, s1
646 ; VI-NEXT: v_mov_b32_e32 v4, s0
647 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
650 ; GFX11-LABEL: extload_v3f16_to_v3f64_arg:
652 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
653 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
654 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16
655 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3
656 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4
657 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s2
658 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
659 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v0
660 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v1
661 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
662 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v6
663 ; GFX11-NEXT: v_mov_b32_e32 v6, 0
664 ; GFX11-NEXT: s_clause 0x1
665 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
666 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
667 ; GFX11-NEXT: s_nop 0
668 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
669 ; GFX11-NEXT: s_endpgm
670 %ext = fpext <3 x half> %arg to <3 x double>
671 store <3 x double> %ext, ptr addrspace(1) %out
; Extends the <4 x half> kernel argument %arg to <4 x double> and stores it to %out.
; The checks expect each element to be converted through f32 (v_cvt_f32_f16
; then v_cvt_f64_f32) and the 32-byte result written as two 128-bit stores:
; one at byte offset 16 and one at the base address.
675 define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 {
676 ; CI-LABEL: extload_v4f16_to_v4f64_arg:
678 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
679 ; CI-NEXT: s_waitcnt lgkmcnt(0)
680 ; CI-NEXT: s_lshr_b32 s4, s3, 16
681 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
682 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
683 ; CI-NEXT: s_lshr_b32 s5, s2, 16
684 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
685 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s5
686 ; CI-NEXT: s_add_u32 s2, s0, 16
687 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
688 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
689 ; CI-NEXT: s_addc_u32 s3, s1, 0
690 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
691 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
692 ; CI-NEXT: v_mov_b32_e32 v9, s3
693 ; CI-NEXT: v_mov_b32_e32 v8, s2
694 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
696 ; CI-NEXT: v_mov_b32_e32 v0, s0
697 ; CI-NEXT: v_mov_b32_e32 v1, s1
698 ; CI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
701 ; VI-LABEL: extload_v4f16_to_v4f64_arg:
703 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
704 ; VI-NEXT: s_waitcnt lgkmcnt(0)
705 ; VI-NEXT: s_lshr_b32 s5, s3, 16
706 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s3
707 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s5
708 ; VI-NEXT: s_lshr_b32 s4, s2, 16
709 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s2
710 ; VI-NEXT: v_cvt_f32_f16_e32 v6, s4
711 ; VI-NEXT: s_add_u32 s2, s0, 16
712 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
713 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
714 ; VI-NEXT: s_addc_u32 s3, s1, 0
715 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
716 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
717 ; VI-NEXT: v_mov_b32_e32 v9, s3
718 ; VI-NEXT: v_mov_b32_e32 v8, s2
719 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
721 ; VI-NEXT: v_mov_b32_e32 v0, s0
722 ; VI-NEXT: v_mov_b32_e32 v1, s1
723 ; VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
726 ; GFX11-LABEL: extload_v4f16_to_v4f64_arg:
728 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
729 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
730 ; GFX11-NEXT: s_lshr_b32 s5, s3, 16
731 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16
732 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3
733 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s5
734 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
735 ; GFX11-NEXT: v_cvt_f32_f16_e32 v8, s4
736 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
737 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v2
738 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v3
739 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
740 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
741 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
742 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
743 ; GFX11-NEXT: s_clause 0x1
744 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
745 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
746 ; GFX11-NEXT: s_nop 0
747 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
748 ; GFX11-NEXT: s_endpgm
749 %ext = fpext <4 x half> %arg to <4 x double>
750 store <4 x double> %ext, ptr addrspace(1) %out
; Extends the <8 x half> kernel argument %arg to <8 x double> and stores it to %out.
; The checks expect each element to be converted through f32 (v_cvt_f32_f16
; then v_cvt_f64_f32) and the 64-byte result written as four 128-bit stores at
; byte offsets 48, 32, 16 and 0 from %out.
754 define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 {
755 ; CI-LABEL: extload_v8f16_to_v8f64_arg:
757 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x4
758 ; CI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
759 ; CI-NEXT: s_waitcnt lgkmcnt(0)
760 ; CI-NEXT: s_lshr_b32 s6, s3, 16
761 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s6
762 ; CI-NEXT: v_cvt_f32_f16_e32 v12, s3
763 ; CI-NEXT: s_lshr_b32 s7, s2, 16
764 ; CI-NEXT: s_lshr_b32 s8, s1, 16
765 ; CI-NEXT: s_lshr_b32 s6, s0, 16
766 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s7
767 ; CI-NEXT: v_cvt_f32_f16_e32 v8, s2
768 ; CI-NEXT: v_cvt_f32_f16_e32 v9, s0
769 ; CI-NEXT: s_add_u32 s0, s4, 48
770 ; CI-NEXT: v_cvt_f32_f16_e32 v5, s1
771 ; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0
772 ; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
773 ; CI-NEXT: s_addc_u32 s1, s5, 0
774 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s8
775 ; CI-NEXT: v_mov_b32_e32 v17, s1
776 ; CI-NEXT: v_mov_b32_e32 v16, s0
777 ; CI-NEXT: s_add_u32 s0, s4, 32
778 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s6
779 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1
780 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
781 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
782 ; CI-NEXT: s_addc_u32 s1, s5, 0
783 ; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
784 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
785 ; CI-NEXT: v_mov_b32_e32 v13, s1
786 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
787 ; CI-NEXT: v_mov_b32_e32 v12, s0
788 ; CI-NEXT: s_add_u32 s0, s4, 16
789 ; CI-NEXT: s_addc_u32 s1, s5, 0
790 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
791 ; CI-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
793 ; CI-NEXT: v_mov_b32_e32 v9, s1
794 ; CI-NEXT: v_mov_b32_e32 v8, s0
795 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
797 ; CI-NEXT: v_mov_b32_e32 v4, s4
798 ; CI-NEXT: v_mov_b32_e32 v5, s5
799 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
802 ; VI-LABEL: extload_v8f16_to_v8f64_arg:
804 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x10
805 ; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
806 ; VI-NEXT: s_waitcnt lgkmcnt(0)
807 ; VI-NEXT: s_lshr_b32 s6, s0, 16
808 ; VI-NEXT: s_lshr_b32 s8, s2, 16
809 ; VI-NEXT: s_lshr_b32 s9, s3, 16
810 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s6
811 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s8
812 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
813 ; VI-NEXT: v_cvt_f32_f16_e32 v12, s3
814 ; VI-NEXT: s_lshr_b32 s7, s1, 16
815 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
816 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
817 ; VI-NEXT: v_cvt_f32_f16_e32 v8, s2
818 ; VI-NEXT: s_add_u32 s0, s4, 48
819 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4
820 ; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5
821 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
822 ; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
823 ; VI-NEXT: s_addc_u32 s1, s5, 0
824 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s7
825 ; VI-NEXT: v_mov_b32_e32 v17, s1
826 ; VI-NEXT: v_mov_b32_e32 v16, s0
827 ; VI-NEXT: s_add_u32 s0, s4, 32
828 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
829 ; VI-NEXT: s_addc_u32 s1, s5, 0
830 ; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
831 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1
832 ; VI-NEXT: v_mov_b32_e32 v13, s1
833 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
834 ; VI-NEXT: v_mov_b32_e32 v12, s0
835 ; VI-NEXT: s_add_u32 s0, s4, 16
836 ; VI-NEXT: s_addc_u32 s1, s5, 0
837 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
838 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
840 ; VI-NEXT: v_mov_b32_e32 v9, s1
841 ; VI-NEXT: v_mov_b32_e32 v8, s0
842 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
844 ; VI-NEXT: v_mov_b32_e32 v4, s4
845 ; VI-NEXT: v_mov_b32_e32 v5, s5
846 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
849 ; GFX11-LABEL: extload_v8f16_to_v8f64_arg:
851 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x10
852 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
853 ; GFX11-NEXT: s_lshr_b32 s9, s7, 16
854 ; GFX11-NEXT: s_lshr_b32 s8, s6, 16
855 ; GFX11-NEXT: s_lshr_b32 s1, s5, 16
856 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s7
857 ; GFX11-NEXT: v_cvt_f32_f16_e32 v11, s9
858 ; GFX11-NEXT: s_lshr_b32 s0, s4, 16
859 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s6
860 ; GFX11-NEXT: v_cvt_f32_f16_e32 v10, s8
861 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s5
862 ; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s1
863 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4
864 ; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s0
865 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v6
866 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v11
867 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v3
868 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
869 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v2
870 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
871 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
872 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v16
873 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
874 ; GFX11-NEXT: v_mov_b32_e32 v16, 0
875 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
876 ; GFX11-NEXT: s_clause 0x3
877 ; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
878 ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
879 ; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
880 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1]
881 ; GFX11-NEXT: s_nop 0
882 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
883 ; GFX11-NEXT: s_endpgm
884 %ext = fpext <8 x half> %arg to <8 x double>
885 store <8 x double> %ext, ptr addrspace(1) %out
; Loads a `half` from the addrspace(1) pointer %in and stores it to %out.
; The checks expect both pointers to arrive in one 128-bit scalar load,
; followed by a 16-bit load (flat_load_ushort / global_load_u16), a wait on
; vmcnt, and a 16-bit store.
889 define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
890 ; CIVI-LABEL: global_load_store_f16:
892 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
893 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
894 ; CIVI-NEXT: v_mov_b32_e32 v0, s2
895 ; CIVI-NEXT: v_mov_b32_e32 v1, s3
896 ; CIVI-NEXT: flat_load_ushort v2, v[0:1]
897 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
898 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
899 ; CIVI-NEXT: s_waitcnt vmcnt(0)
900 ; CIVI-NEXT: flat_store_short v[0:1], v2
901 ; CIVI-NEXT: s_endpgm
903 ; GFX11-LABEL: global_load_store_f16:
905 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
906 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
907 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
908 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
909 ; GFX11-NEXT: s_waitcnt vmcnt(0)
910 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
911 ; GFX11-NEXT: s_nop 0
912 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
913 ; GFX11-NEXT: s_endpgm
914 %val = load half, ptr addrspace(1) %in
915 store half %val, ptr addrspace(1) %out
; Round-trips a <2 x half> vector from %in to %out.
; The checks expect the two halves to be moved together as one 32-bit load and
; one 32-bit store.
919 define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
920 ; CIVI-LABEL: global_load_store_v2f16:
922 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
923 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
924 ; CIVI-NEXT: v_mov_b32_e32 v0, s2
925 ; CIVI-NEXT: v_mov_b32_e32 v1, s3
926 ; CIVI-NEXT: flat_load_dword v2, v[0:1]
927 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
928 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
929 ; CIVI-NEXT: s_waitcnt vmcnt(0)
930 ; CIVI-NEXT: flat_store_dword v[0:1], v2
931 ; CIVI-NEXT: s_endpgm
933 ; GFX11-LABEL: global_load_store_v2f16:
935 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
936 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
937 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
938 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
939 ; GFX11-NEXT: s_waitcnt vmcnt(0)
940 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
941 ; GFX11-NEXT: s_nop 0
942 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
943 ; GFX11-NEXT: s_endpgm
944 %val = load <2 x half>, ptr addrspace(1) %in
945 store <2 x half> %val, ptr addrspace(1) %out
; Round-trips a <4 x half> vector with a single 64-bit load and a single 64-bit
; store.
; NOTE(review): the parameters here are declared as (%in, %out), the reverse of
; the neighbouring global_load_store_* kernels. That is why the checks load
; through s[0:1] and store through s[2:3]; the autogenerated register
; assignments depend on this order, so do not swap it without regenerating.
949 define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
950 ; CIVI-LABEL: global_load_store_v4f16:
952 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
953 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
954 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
955 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
956 ; CIVI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
957 ; CIVI-NEXT: v_mov_b32_e32 v2, s2
958 ; CIVI-NEXT: v_mov_b32_e32 v3, s3
959 ; CIVI-NEXT: s_waitcnt vmcnt(0)
960 ; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
961 ; CIVI-NEXT: s_endpgm
963 ; GFX11-LABEL: global_load_store_v4f16:
965 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
966 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
967 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
968 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1]
969 ; GFX11-NEXT: s_waitcnt vmcnt(0)
970 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
971 ; GFX11-NEXT: s_nop 0
972 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
973 ; GFX11-NEXT: s_endpgm
974 %val = load <4 x half>, ptr addrspace(1) %in
975 store <4 x half> %val, ptr addrspace(1) %out
; Round-trips a <8 x half> vector from %in to %out.
; The checks expect the whole vector to be moved as one 128-bit load and one
; 128-bit store.
979 define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
980 ; CIVI-LABEL: global_load_store_v8f16:
982 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
983 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
984 ; CIVI-NEXT: v_mov_b32_e32 v0, s2
985 ; CIVI-NEXT: v_mov_b32_e32 v1, s3
986 ; CIVI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
987 ; CIVI-NEXT: v_mov_b32_e32 v4, s0
988 ; CIVI-NEXT: v_mov_b32_e32 v5, s1
989 ; CIVI-NEXT: s_waitcnt vmcnt(0)
990 ; CIVI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
991 ; CIVI-NEXT: s_endpgm
993 ; GFX11-LABEL: global_load_store_v8f16:
995 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
996 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
997 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
998 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
999 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1000 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
1001 ; GFX11-NEXT: s_nop 0
1002 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1003 ; GFX11-NEXT: s_endpgm
1004 %val = load <8 x half>, ptr addrspace(1) %in
1005 store <8 x half> %val, ptr addrspace(1) %out
; Loads a half from %in, extends it to float with fpext, and stores the float
; to %out. The checks expect a 16-bit load, a single v_cvt_f32_f16, and a
; 32-bit store.
1009 define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1010 ; CIVI-LABEL: global_extload_f16_to_f32:
1012 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1013 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
1014 ; CIVI-NEXT: v_mov_b32_e32 v0, s2
1015 ; CIVI-NEXT: v_mov_b32_e32 v1, s3
1016 ; CIVI-NEXT: flat_load_ushort v0, v[0:1]
1017 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
1018 ; CIVI-NEXT: s_waitcnt vmcnt(0)
1019 ; CIVI-NEXT: v_cvt_f32_f16_e32 v2, v0
1020 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
1021 ; CIVI-NEXT: flat_store_dword v[0:1], v2
1022 ; CIVI-NEXT: s_endpgm
1024 ; GFX11-LABEL: global_extload_f16_to_f32:
1026 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
1027 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1028 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1029 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
1030 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1031 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
1032 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1033 ; GFX11-NEXT: s_nop 0
1034 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1035 ; GFX11-NEXT: s_endpgm
1036 %val = load half, ptr addrspace(1) %in
1037 %cvt = fpext half %val to float
1038 store float %cvt, ptr addrspace(1) %out
; Loads a <2 x half> from %in, extends both elements to float, and stores the
; <2 x float> to %out. Both halves arrive in one 32-bit load. The kaveri and
; gfx1100 checks shift the upper half down by 16 before converting it, while the
; tonga checks convert it directly with an SDWA WORD_1 source select.
1042 define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1043 ; CI-LABEL: global_extload_v2f16_to_v2f32:
1045 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1046 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1047 ; CI-NEXT: v_mov_b32_e32 v0, s2
1048 ; CI-NEXT: v_mov_b32_e32 v1, s3
1049 ; CI-NEXT: flat_load_dword v1, v[0:1]
1050 ; CI-NEXT: v_mov_b32_e32 v2, s0
1051 ; CI-NEXT: v_mov_b32_e32 v3, s1
1052 ; CI-NEXT: s_waitcnt vmcnt(0)
1053 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v1
1054 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1055 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
1056 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1059 ; VI-LABEL: global_extload_v2f16_to_v2f32:
1061 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1062 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1063 ; VI-NEXT: v_mov_b32_e32 v0, s2
1064 ; VI-NEXT: v_mov_b32_e32 v1, s3
1065 ; VI-NEXT: flat_load_dword v1, v[0:1]
1066 ; VI-NEXT: v_mov_b32_e32 v2, s0
1067 ; VI-NEXT: v_mov_b32_e32 v3, s1
1068 ; VI-NEXT: s_waitcnt vmcnt(0)
1069 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v1
1070 ; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1071 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1074 ; GFX11-LABEL: global_extload_v2f16_to_v2f32:
1076 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
1077 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1078 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1079 ; GFX11-NEXT: global_load_b32 v0, v2, s[2:3]
1080 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1081 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1082 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
1083 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1084 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
1085 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1086 ; GFX11-NEXT: s_nop 0
1087 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1088 ; GFX11-NEXT: s_endpgm
1089 %val = load <2 x half>, ptr addrspace(1) %in
1090 %cvt = fpext <2 x half> %val to <2 x float>
1091 store <2 x float> %cvt, ptr addrspace(1) %out
; Loads a <3 x half> from %in, extends each element to float, and stores the
; <3 x float> to %out. The checks expect the three halves to be fetched with one
; 64-bit load and the three floats to be written with one 96-bit store
; (flat_store_dwordx3 / global_store_b96).
1095 define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1096 ; CI-LABEL: global_extload_v3f16_to_v3f32:
1098 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1099 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1100 ; CI-NEXT: v_mov_b32_e32 v0, s2
1101 ; CI-NEXT: v_mov_b32_e32 v1, s3
1102 ; CI-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
1103 ; CI-NEXT: v_mov_b32_e32 v3, s0
1104 ; CI-NEXT: v_mov_b32_e32 v4, s1
1105 ; CI-NEXT: s_waitcnt vmcnt(0)
1106 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v1
1107 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1108 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
1109 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
1110 ; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
1113 ; VI-LABEL: global_extload_v3f16_to_v3f32:
1115 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1116 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1117 ; VI-NEXT: v_mov_b32_e32 v0, s2
1118 ; VI-NEXT: v_mov_b32_e32 v1, s3
1119 ; VI-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
1120 ; VI-NEXT: v_mov_b32_e32 v3, s0
1121 ; VI-NEXT: v_mov_b32_e32 v4, s1
1122 ; VI-NEXT: s_waitcnt vmcnt(0)
1123 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v1
1124 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
1125 ; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1126 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
1129 ; GFX11-LABEL: global_extload_v3f16_to_v3f32:
1131 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
1132 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
1133 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1134 ; GFX11-NEXT: global_load_b64 v[0:1], v3, s[2:3]
1135 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1136 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
1137 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
1138 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
1139 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
1140 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v4
1141 ; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1]
1142 ; GFX11-NEXT: s_nop 0
1143 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1144 ; GFX11-NEXT: s_endpgm
1145 %val = load <3 x half>, ptr addrspace(1) %in
1146 %cvt = fpext <3 x half> %val to <3 x float>
1147 store <3 x float> %cvt, ptr addrspace(1) %out
; Loads a <4 x half> from %in, extends each element to float, and stores the
; <4 x float> to %out. The checks expect one 64-bit load, four v_cvt_f32_f16
; conversions (the tonga checks use SDWA WORD_1 for the upper halves instead of
; explicit shifts), and one 128-bit store.
1151 define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1152 ; CI-LABEL: global_extload_v4f16_to_v4f32:
1154 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1155 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1156 ; CI-NEXT: v_mov_b32_e32 v0, s2
1157 ; CI-NEXT: v_mov_b32_e32 v1, s3
1158 ; CI-NEXT: flat_load_dwordx2 v[3:4], v[0:1]
1159 ; CI-NEXT: v_mov_b32_e32 v5, s1
1160 ; CI-NEXT: s_waitcnt vmcnt(0)
1161 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v4
1162 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
1163 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
1164 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v3
1165 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v1
1166 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v4
1167 ; CI-NEXT: v_mov_b32_e32 v4, s0
1168 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1171 ; VI-LABEL: global_extload_v4f16_to_v4f32:
1173 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1174 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1175 ; VI-NEXT: v_mov_b32_e32 v0, s2
1176 ; VI-NEXT: v_mov_b32_e32 v1, s3
1177 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
1178 ; VI-NEXT: s_waitcnt vmcnt(0)
1179 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v4
1180 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v5
1181 ; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1182 ; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1183 ; VI-NEXT: v_mov_b32_e32 v4, s0
1184 ; VI-NEXT: v_mov_b32_e32 v5, s1
1185 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1188 ; GFX11-LABEL: global_extload_v4f16_to_v4f32:
1190 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
1191 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
1192 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1193 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3]
1194 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1195 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
1196 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
1197 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
1198 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
1199 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1200 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
1201 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v5
1202 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
1203 ; GFX11-NEXT: s_nop 0
1204 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1205 ; GFX11-NEXT: s_endpgm
1206 %val = load <4 x half>, ptr addrspace(1) %in
1207 %cvt = fpext <4 x half> %val to <4 x float>
1208 store <4 x float> %cvt, ptr addrspace(1) %out
; Loads a <8 x half> from %in, extends each element to float, and stores the
; <8 x float> to %out. The checks expect one 128-bit load, eight v_cvt_f32_f16
; conversions, and the 32-byte result written as two 128-bit stores: one at
; %out + 16 and one at %out.
1212 define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1213 ; CI-LABEL: global_extload_v8f16_to_v8f32:
1215 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1216 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1217 ; CI-NEXT: v_mov_b32_e32 v0, s2
1218 ; CI-NEXT: v_mov_b32_e32 v1, s3
1219 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1220 ; CI-NEXT: s_add_u32 s2, s0, 16
1221 ; CI-NEXT: s_addc_u32 s3, s1, 0
1222 ; CI-NEXT: v_mov_b32_e32 v13, s1
1223 ; CI-NEXT: v_mov_b32_e32 v12, s0
1224 ; CI-NEXT: s_waitcnt vmcnt(0)
1225 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v3
1226 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v2
1227 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1228 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1229 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v1
1230 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v0
1231 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1232 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1233 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v3
1234 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v2
1235 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v1
1236 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v0
1237 ; CI-NEXT: v_mov_b32_e32 v0, s2
1238 ; CI-NEXT: v_mov_b32_e32 v1, s3
1239 ; CI-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
1240 ; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
1243 ; VI-LABEL: global_extload_v8f16_to_v8f32:
1245 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1246 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1247 ; VI-NEXT: v_mov_b32_e32 v0, s2
1248 ; VI-NEXT: v_mov_b32_e32 v1, s3
1249 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1250 ; VI-NEXT: s_add_u32 s2, s0, 16
1251 ; VI-NEXT: s_addc_u32 s3, s1, 0
1252 ; VI-NEXT: v_mov_b32_e32 v13, s1
1253 ; VI-NEXT: v_mov_b32_e32 v12, s0
1254 ; VI-NEXT: s_waitcnt vmcnt(0)
1255 ; VI-NEXT: v_cvt_f32_f16_e32 v10, v3
1256 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v2
1257 ; VI-NEXT: v_cvt_f32_f16_sdwa v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1258 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1259 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v1
1260 ; VI-NEXT: v_cvt_f32_f16_e32 v4, v0
1261 ; VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1262 ; VI-NEXT: v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1263 ; VI-NEXT: v_mov_b32_e32 v0, s2
1264 ; VI-NEXT: v_mov_b32_e32 v1, s3
1265 ; VI-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
1266 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
1269 ; GFX11-LABEL: global_extload_v8f16_to_v8f32:
1271 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
1272 ; GFX11-NEXT: v_mov_b32_e32 v12, 0
1273 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1274 ; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3]
1275 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1276 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3
1277 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2
1278 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v1
1279 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v0
1280 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1281 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1282 ; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v3
1283 ; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v2
1284 ; GFX11-NEXT: v_cvt_f32_f16_e32 v11, v5
1285 ; GFX11-NEXT: v_cvt_f32_f16_e32 v9, v9
1286 ; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v1
1287 ; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v0
1288 ; GFX11-NEXT: s_clause 0x1
1289 ; GFX11-NEXT: global_store_b128 v12, v[8:11], s[0:1] offset:16
1290 ; GFX11-NEXT: global_store_b128 v12, v[4:7], s[0:1]
1291 ; GFX11-NEXT: s_nop 0
1292 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1293 ; GFX11-NEXT: s_endpgm
1294 %val = load <8 x half>, ptr addrspace(1) %in
1295 %cvt = fpext <8 x half> %val to <8 x float>
1296 store <8 x float> %cvt, ptr addrspace(1) %out
; Loads a <16 x half> from %in, extends each element to float, and stores the
; <16 x float> to %out. The checks expect the input to be fetched with two
; 128-bit loads (%in and %in + 16), sixteen v_cvt_f32_f16 conversions, and the
; 64-byte result written as four 128-bit stores at %out + 0, 16, 32 and 48.
1300 define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1301 ; CI-LABEL: global_extload_v16f16_to_v16f32:
1303 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1304 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1305 ; CI-NEXT: s_add_u32 s4, s2, 16
1306 ; CI-NEXT: s_addc_u32 s5, s3, 0
1307 ; CI-NEXT: v_mov_b32_e32 v0, s4
1308 ; CI-NEXT: v_mov_b32_e32 v5, s3
1309 ; CI-NEXT: v_mov_b32_e32 v1, s5
1310 ; CI-NEXT: v_mov_b32_e32 v4, s2
1311 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1312 ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
1313 ; CI-NEXT: s_add_u32 s2, s0, 16
1314 ; CI-NEXT: s_addc_u32 s3, s1, 0
1315 ; CI-NEXT: s_waitcnt vmcnt(1)
1316 ; CI-NEXT: v_cvt_f32_f16_e32 v14, v3
1317 ; CI-NEXT: s_waitcnt vmcnt(0)
1318 ; CI-NEXT: v_cvt_f32_f16_e32 v18, v7
1319 ; CI-NEXT: v_cvt_f32_f16_e32 v16, v6
1320 ; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v7
1321 ; CI-NEXT: v_lshrrev_b32_e32 v25, 16, v6
1322 ; CI-NEXT: v_mov_b32_e32 v7, s3
1323 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1324 ; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v5
1325 ; CI-NEXT: v_mov_b32_e32 v6, s2
1326 ; CI-NEXT: s_add_u32 s2, s0, 48
1327 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v1
1328 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v0
1329 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1330 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v0
1331 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v2
1332 ; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v2
1333 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v5
1334 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v4
1335 ; CI-NEXT: v_lshrrev_b32_e32 v24, 16, v4
1336 ; CI-NEXT: v_mov_b32_e32 v5, s1
1337 ; CI-NEXT: s_addc_u32 s3, s1, 0
1338 ; CI-NEXT: v_cvt_f32_f16_e32 v15, v3
1339 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v17
1340 ; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
1341 ; CI-NEXT: v_cvt_f32_f16_e32 v17, v25
1342 ; CI-NEXT: v_mov_b32_e32 v4, s0
1343 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v1
1344 ; CI-NEXT: s_add_u32 s0, s0, 32
1345 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v24
1346 ; CI-NEXT: s_addc_u32 s1, s1, 0
1347 ; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
1348 ; CI-NEXT: v_mov_b32_e32 v21, s3
1349 ; CI-NEXT: v_mov_b32_e32 v23, s1
1350 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
1351 ; CI-NEXT: v_mov_b32_e32 v20, s2
1352 ; CI-NEXT: v_mov_b32_e32 v22, s0
1353 ; CI-NEXT: flat_store_dwordx4 v[6:7], v[16:19]
1354 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1355 ; CI-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
1356 ; CI-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
1359 ; VI-LABEL: global_extload_v16f16_to_v16f32:
1361 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1362 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1363 ; VI-NEXT: v_mov_b32_e32 v0, s2
1364 ; VI-NEXT: v_mov_b32_e32 v1, s3
1365 ; VI-NEXT: s_add_u32 s2, s2, 16
1366 ; VI-NEXT: s_addc_u32 s3, s3, 0
1367 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1368 ; VI-NEXT: v_mov_b32_e32 v5, s3
1369 ; VI-NEXT: v_mov_b32_e32 v4, s2
1370 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
1371 ; VI-NEXT: s_add_u32 s2, s0, 16
1372 ; VI-NEXT: s_addc_u32 s3, s1, 0
1373 ; VI-NEXT: v_mov_b32_e32 v23, s3
1374 ; VI-NEXT: v_mov_b32_e32 v22, s2
1375 ; VI-NEXT: s_add_u32 s2, s0, 48
1376 ; VI-NEXT: v_mov_b32_e32 v21, s1
1377 ; VI-NEXT: s_addc_u32 s3, s1, 0
1378 ; VI-NEXT: v_mov_b32_e32 v20, s0
1379 ; VI-NEXT: s_add_u32 s0, s0, 32
1380 ; VI-NEXT: s_addc_u32 s1, s1, 0
1381 ; VI-NEXT: v_mov_b32_e32 v25, s3
1382 ; VI-NEXT: v_mov_b32_e32 v27, s1
1383 ; VI-NEXT: v_mov_b32_e32 v24, s2
1384 ; VI-NEXT: v_mov_b32_e32 v26, s0
1385 ; VI-NEXT: s_waitcnt vmcnt(1)
1386 ; VI-NEXT: v_cvt_f32_f16_e32 v14, v3
1387 ; VI-NEXT: v_cvt_f32_f16_e32 v12, v2
1388 ; VI-NEXT: v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1389 ; VI-NEXT: v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1390 ; VI-NEXT: v_cvt_f32_f16_e32 v10, v1
1391 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v0
1392 ; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1393 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1394 ; VI-NEXT: s_waitcnt vmcnt(0)
1395 ; VI-NEXT: v_cvt_f32_f16_e32 v18, v7
1396 ; VI-NEXT: v_cvt_f32_f16_e32 v16, v6
1397 ; VI-NEXT: v_cvt_f32_f16_sdwa v19, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1398 ; VI-NEXT: v_cvt_f32_f16_sdwa v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1399 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v5
1400 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v4
1401 ; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1402 ; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1403 ; VI-NEXT: flat_store_dwordx4 v[22:23], v[12:15]
1404 ; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
1405 ; VI-NEXT: flat_store_dwordx4 v[24:25], v[16:19]
1406 ; VI-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
1409 ; GFX11-LABEL: global_extload_v16f16_to_v16f32:
1411 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
1412 ; GFX11-NEXT: v_mov_b32_e32 v20, 0
1413 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1414 ; GFX11-NEXT: s_clause 0x1
1415 ; GFX11-NEXT: global_load_b128 v[0:3], v20, s[2:3]
1416 ; GFX11-NEXT: global_load_b128 v[4:7], v20, s[2:3] offset:16
1417 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1418 ; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v1
1419 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1420 ; GFX11-NEXT: v_cvt_f32_f16_e32 v18, v7
1421 ; GFX11-NEXT: v_cvt_f32_f16_e32 v16, v6
1422 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
1423 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
1424 ; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v0
1425 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1426 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0
1427 ; GFX11-NEXT: v_cvt_f32_f16_e32 v14, v3
1428 ; GFX11-NEXT: v_cvt_f32_f16_e32 v12, v2
1429 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
1430 ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v2
1431 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v5
1432 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v4
1433 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
1434 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
1435 ; GFX11-NEXT: v_cvt_f32_f16_e32 v19, v7
1436 ; GFX11-NEXT: v_cvt_f32_f16_e32 v17, v6
1437 ; GFX11-NEXT: v_cvt_f32_f16_e32 v11, v1
1438 ; GFX11-NEXT: v_cvt_f32_f16_e32 v15, v3
1439 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v5
1440 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v4
1441 ; GFX11-NEXT: v_cvt_f32_f16_e32 v13, v13
1442 ; GFX11-NEXT: v_cvt_f32_f16_e32 v9, v9
1443 ; GFX11-NEXT: s_clause 0x3
1444 ; GFX11-NEXT: global_store_b128 v20, v[16:19], s[0:1] offset:48
1445 ; GFX11-NEXT: global_store_b128 v20, v[0:3], s[0:1] offset:32
1446 ; GFX11-NEXT: global_store_b128 v20, v[12:15], s[0:1] offset:16
1447 ; GFX11-NEXT: global_store_b128 v20, v[8:11], s[0:1]
1448 ; GFX11-NEXT: s_nop 0
1449 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1450 ; GFX11-NEXT: s_endpgm
1451 %val = load <16 x half>, ptr addrspace(1) %in
1452 %cvt = fpext <16 x half> %val to <16 x float>
1453 store <16 x float> %cvt, ptr addrspace(1) %out
; Loads a half from %in, extends it to double with fpext, and stores the double
; to %out. The checks expect the extension to be done in two steps,
; v_cvt_f32_f16 followed by v_cvt_f64_f32, between a 16-bit load and a 64-bit
; store.
1457 define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1458 ; CIVI-LABEL: global_extload_f16_to_f64:
1460 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1461 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
1462 ; CIVI-NEXT: v_mov_b32_e32 v0, s2
1463 ; CIVI-NEXT: v_mov_b32_e32 v1, s3
1464 ; CIVI-NEXT: flat_load_ushort v0, v[0:1]
1465 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
1466 ; CIVI-NEXT: v_mov_b32_e32 v3, s1
1467 ; CIVI-NEXT: s_waitcnt vmcnt(0)
1468 ; CIVI-NEXT: v_cvt_f32_f16_e32 v0, v0
1469 ; CIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
1470 ; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1471 ; CIVI-NEXT: s_endpgm
1473 ; GFX11-LABEL: global_extload_f16_to_f64:
1475 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
1476 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1477 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1478 ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3]
1479 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1480 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
1481 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1482 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
1483 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1484 ; GFX11-NEXT: s_nop 0
1485 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1486 ; GFX11-NEXT: s_endpgm
1487 %val = load half, ptr addrspace(1) %in
1488 %cvt = fpext half %val to double
1489 store double %cvt, ptr addrspace(1) %out
; Loads a <2 x half> from %in, extends both elements to double, and stores the
; <2 x double> to %out. The checks expect one 32-bit load, a v_cvt_f32_f16 then
; v_cvt_f64_f32 pair per element, and one 128-bit store of the two doubles.
1493 define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1494 ; CI-LABEL: global_extload_v2f16_to_v2f64:
1496 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1497 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1498 ; CI-NEXT: v_mov_b32_e32 v0, s2
1499 ; CI-NEXT: v_mov_b32_e32 v1, s3
1500 ; CI-NEXT: flat_load_dword v0, v[0:1]
1501 ; CI-NEXT: v_mov_b32_e32 v4, s0
1502 ; CI-NEXT: v_mov_b32_e32 v5, s1
1503 ; CI-NEXT: s_waitcnt vmcnt(0)
1504 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1505 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
1506 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v1
1507 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
1508 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
1509 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1512 ; VI-LABEL: global_extload_v2f16_to_v2f64:
1514 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1515 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1516 ; VI-NEXT: v_mov_b32_e32 v0, s2
1517 ; VI-NEXT: v_mov_b32_e32 v1, s3
1518 ; VI-NEXT: flat_load_dword v0, v[0:1]
1519 ; VI-NEXT: v_mov_b32_e32 v4, s0
1520 ; VI-NEXT: v_mov_b32_e32 v5, s1
1521 ; VI-NEXT: s_waitcnt vmcnt(0)
1522 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
1523 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1524 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
1525 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
1526 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1529 ; GFX11-LABEL: global_extload_v2f16_to_v2f64:
1531 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
1532 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
1533 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1534 ; GFX11-NEXT: global_load_b32 v0, v4, s[2:3]
1535 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1536 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1537 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
1538 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1539 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
1540 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
1541 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1542 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
1543 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
1544 ; GFX11-NEXT: s_nop 0
1545 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1546 ; GFX11-NEXT: s_endpgm
1547 %val = load <2 x half>, ptr addrspace(1) %in
1548 %cvt = fpext <2 x half> %val to <2 x double>
1549 store <2 x double> %cvt, ptr addrspace(1) %out
; Loads a <3 x half> from %in, extends each element to double, and stores the
; <3 x double> to %out. The checks expect one 64-bit load and a v_cvt_f32_f16
; then v_cvt_f64_f32 pair per element. The result is split into a 64-bit store
; of the third element at %out + 16 and a 128-bit store of the first two
; elements at %out.
1553 define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1554 ; CI-LABEL: global_extload_v3f16_to_v3f64:
1556 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1557 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1558 ; CI-NEXT: v_mov_b32_e32 v0, s2
1559 ; CI-NEXT: v_mov_b32_e32 v1, s3
1560 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1561 ; CI-NEXT: s_add_u32 s2, s0, 16
1562 ; CI-NEXT: s_addc_u32 s3, s1, 0
1563 ; CI-NEXT: v_mov_b32_e32 v7, s3
1564 ; CI-NEXT: v_mov_b32_e32 v6, s2
1565 ; CI-NEXT: s_waitcnt vmcnt(0)
1566 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
1567 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1568 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
1569 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
1570 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1
1571 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
1572 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
1573 ; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
1574 ; CI-NEXT: v_mov_b32_e32 v5, s1
1575 ; CI-NEXT: v_mov_b32_e32 v4, s0
1576 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1579 ; VI-LABEL: global_extload_v3f16_to_v3f64:
1581 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1582 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1583 ; VI-NEXT: v_mov_b32_e32 v0, s2
1584 ; VI-NEXT: v_mov_b32_e32 v1, s3
1585 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1586 ; VI-NEXT: s_add_u32 s2, s0, 16
1587 ; VI-NEXT: s_addc_u32 s3, s1, 0
1588 ; VI-NEXT: v_mov_b32_e32 v5, s1
1589 ; VI-NEXT: v_mov_b32_e32 v4, s0
1590 ; VI-NEXT: s_waitcnt vmcnt(0)
1591 ; VI-NEXT: v_cvt_f32_f16_e32 v3, v1
1592 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v0
1593 ; VI-NEXT: v_cvt_f32_f16_sdwa v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1594 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v3
1595 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
1596 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
1597 ; VI-NEXT: v_mov_b32_e32 v9, s3
1598 ; VI-NEXT: v_mov_b32_e32 v8, s2
1599 ; VI-NEXT: flat_store_dwordx2 v[8:9], v[6:7]
1600 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1603 ; GFX11-LABEL: global_extload_v3f16_to_v3f64:
1605 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
1606 ; GFX11-NEXT: v_mov_b32_e32 v6, 0
1607 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1608 ; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3]
1609 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1610 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1611 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
1612 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
1613 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1614 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
1615 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
1616 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1617 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
1618 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
1619 ; GFX11-NEXT: s_clause 0x1
1620 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
1621 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
1622 ; GFX11-NEXT: s_nop 0
1623 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1624 ; GFX11-NEXT: s_endpgm
1625 %val = load <3 x half>, ptr addrspace(1) %in
1626 %cvt = fpext <3 x half> %val to <3 x double>
1627 store <3 x double> %cvt, ptr addrspace(1) %out
; Loads a <4 x half> from %in, extends each element to double, and stores the
; <4 x double> to %out. The checks expect one 64-bit load, a v_cvt_f32_f16 then
; v_cvt_f64_f32 pair per element, and the 32-byte result written as two 128-bit
; stores: one at %out + 16 and one at %out.
1631 define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1632 ; CI-LABEL: global_extload_v4f16_to_v4f64:
1634 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1635 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1636 ; CI-NEXT: v_mov_b32_e32 v0, s2
1637 ; CI-NEXT: v_mov_b32_e32 v1, s3
1638 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1639 ; CI-NEXT: s_add_u32 s2, s0, 16
1640 ; CI-NEXT: s_addc_u32 s3, s1, 0
1641 ; CI-NEXT: v_mov_b32_e32 v9, s1
1642 ; CI-NEXT: v_mov_b32_e32 v8, s0
1643 ; CI-NEXT: s_waitcnt vmcnt(0)
1644 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
1645 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v1
1646 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v0
1647 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
1648 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
1649 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v0
1650 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
1651 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
1652 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2
1653 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10
1654 ; CI-NEXT: v_mov_b32_e32 v11, s3
1655 ; CI-NEXT: v_mov_b32_e32 v10, s2
1656 ; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
1657 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
1660 ; VI-LABEL: global_extload_v4f16_to_v4f64:
1662 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1663 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1664 ; VI-NEXT: v_mov_b32_e32 v0, s2
1665 ; VI-NEXT: v_mov_b32_e32 v1, s3
1666 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1667 ; VI-NEXT: s_add_u32 s2, s0, 16
1668 ; VI-NEXT: s_addc_u32 s3, s1, 0
1669 ; VI-NEXT: v_mov_b32_e32 v9, s1
1670 ; VI-NEXT: v_mov_b32_e32 v8, s0
1671 ; VI-NEXT: s_waitcnt vmcnt(0)
1672 ; VI-NEXT: v_cvt_f32_f16_e32 v3, v1
1673 ; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1674 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v0
1675 ; VI-NEXT: v_cvt_f32_f16_sdwa v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1676 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
1677 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
1678 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
1679 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10
1680 ; VI-NEXT: v_mov_b32_e32 v11, s3
1681 ; VI-NEXT: v_mov_b32_e32 v10, s2
1682 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
1683 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
1686 ; GFX11-LABEL: global_extload_v4f16_to_v4f64:
1688 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
1689 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
1690 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1691 ; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3]
1692 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1693 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
1694 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
1695 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1
1696 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
1697 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1698 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
1699 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
1700 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1701 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
1702 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
1703 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1704 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v2
1705 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
1706 ; GFX11-NEXT: s_clause 0x1
1707 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
1708 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
1709 ; GFX11-NEXT: s_nop 0
1710 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1711 ; GFX11-NEXT: s_endpgm
1712 %val = load <4 x half>, ptr addrspace(1) %in
1713 %cvt = fpext <4 x half> %val to <4 x double>
1714 store <4 x double> %cvt, ptr addrspace(1) %out
; Loads <8 x half> from %in, extends it to <8 x double> with fpext, and stores
; the result to %out. The expected code widens every element in two steps
; (v_cvt_f32_f16 followed by v_cvt_f64_f32) and writes the result with four
; 128-bit stores. The odd (high) halves are extracted with a 16-bit right shift
; in the CI and GFX11 checks and with an SDWA WORD_1 source select in the VI
; checks. The assertions are autogenerated (see the NOTE at the top of the
; file).
1718 define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1719 ; CI-LABEL: global_extload_v8f16_to_v8f64:
1721 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1722 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1723 ; CI-NEXT: v_mov_b32_e32 v0, s2
1724 ; CI-NEXT: v_mov_b32_e32 v1, s3
1725 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1726 ; CI-NEXT: s_add_u32 s2, s0, 48
1727 ; CI-NEXT: s_addc_u32 s3, s1, 0
1728 ; CI-NEXT: v_mov_b32_e32 v19, s3
1729 ; CI-NEXT: v_mov_b32_e32 v18, s2
1730 ; CI-NEXT: s_add_u32 s2, s0, 32
1731 ; CI-NEXT: v_mov_b32_e32 v17, s1
1732 ; CI-NEXT: s_addc_u32 s3, s1, 0
1733 ; CI-NEXT: v_mov_b32_e32 v16, s0
1734 ; CI-NEXT: s_add_u32 s0, s0, 16
1735 ; CI-NEXT: s_addc_u32 s1, s1, 0
1736 ; CI-NEXT: v_mov_b32_e32 v21, s3
1737 ; CI-NEXT: v_mov_b32_e32 v23, s1
1738 ; CI-NEXT: v_mov_b32_e32 v20, s2
1739 ; CI-NEXT: v_mov_b32_e32 v22, s0
1740 ; CI-NEXT: s_waitcnt vmcnt(0)
1741 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
1742 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
1743 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
1744 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v4
1745 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
1746 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
1747 ; CI-NEXT: v_cvt_f32_f16_e32 v11, v5
1748 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v1
1749 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1750 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
1751 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
1752 ; CI-NEXT: v_cvt_f32_f16_e32 v24, v1
1753 ; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v3
1754 ; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v10
1755 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
1756 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
1757 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
1758 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
1759 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
1760 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v24
1761 ; CI-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
1762 ; CI-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
1763 ; CI-NEXT: flat_store_dwordx4 v[22:23], v[4:7]
1764 ; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
1767 ; VI-LABEL: global_extload_v8f16_to_v8f64:
1769 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1770 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1771 ; VI-NEXT: v_mov_b32_e32 v0, s2
1772 ; VI-NEXT: v_mov_b32_e32 v1, s3
1773 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1774 ; VI-NEXT: s_add_u32 s2, s0, 48
1775 ; VI-NEXT: s_addc_u32 s3, s1, 0
1776 ; VI-NEXT: v_mov_b32_e32 v19, s3
1777 ; VI-NEXT: v_mov_b32_e32 v18, s2
1778 ; VI-NEXT: s_add_u32 s2, s0, 32
1779 ; VI-NEXT: v_mov_b32_e32 v17, s1
1780 ; VI-NEXT: s_addc_u32 s3, s1, 0
1781 ; VI-NEXT: v_mov_b32_e32 v16, s0
1782 ; VI-NEXT: s_add_u32 s0, s0, 16
1783 ; VI-NEXT: s_addc_u32 s1, s1, 0
1784 ; VI-NEXT: v_mov_b32_e32 v21, s3
1785 ; VI-NEXT: v_mov_b32_e32 v23, s1
1786 ; VI-NEXT: v_mov_b32_e32 v20, s2
1787 ; VI-NEXT: v_mov_b32_e32 v22, s0
1788 ; VI-NEXT: s_waitcnt vmcnt(0)
1789 ; VI-NEXT: v_cvt_f32_f16_e32 v10, v3
1790 ; VI-NEXT: v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1791 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v2
1792 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1793 ; VI-NEXT: v_cvt_f32_f16_e32 v4, v0
1794 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v1
1795 ; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1796 ; VI-NEXT: v_cvt_f32_f16_sdwa v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1797 ; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v10
1798 ; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v3
1799 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v7
1800 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
1801 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
1802 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
1803 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
1804 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v24
1805 ; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
1806 ; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
1807 ; VI-NEXT: flat_store_dwordx4 v[22:23], v[4:7]
1808 ; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
1811 ; GFX11-LABEL: global_extload_v8f16_to_v8f64:
1813 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
1814 ; GFX11-NEXT: v_mov_b32_e32 v16, 0
1815 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1816 ; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3]
1817 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1818 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v0
1819 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
1820 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v1
1821 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3
1822 ; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v2
1823 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1824 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
1825 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
1826 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
1827 ; GFX11-NEXT: v_cvt_f32_f16_e32 v17, v5
1828 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
1829 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v9
1830 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
1831 ; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v7
1832 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v3
1833 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
1834 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v6
1835 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
1836 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
1837 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v17
1838 ; GFX11-NEXT: s_clause 0x3
1839 ; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
1840 ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
1841 ; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
1842 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1]
1843 ; GFX11-NEXT: s_nop 0
1844 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1845 ; GFX11-NEXT: s_endpgm
1846 %val = load <8 x half>, ptr addrspace(1) %in
1847 %cvt = fpext <8 x half> %val to <8 x double>
1848 store <8 x double> %cvt, ptr addrspace(1) %out
; Loads <16 x half> from %in, extends it to <16 x double> with fpext, and
; stores the result to %out. The expected code reads the input with two
; 128-bit loads, widens every element in two steps (v_cvt_f32_f16 followed by
; v_cvt_f64_f32), and writes the result with eight 128-bit stores. The CI and
; VI checks begin converting the first load under vmcnt(1) while the second
; load is still outstanding. The assertions are autogenerated (see the NOTE at
; the top of the file).
1852 define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1853 ; CI-LABEL: global_extload_v16f16_to_v16f64:
1855 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1856 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1857 ; CI-NEXT: v_mov_b32_e32 v0, s2
1858 ; CI-NEXT: v_mov_b32_e32 v1, s3
1859 ; CI-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
1860 ; CI-NEXT: s_add_u32 s2, s2, 16
1861 ; CI-NEXT: s_addc_u32 s3, s3, 0
1862 ; CI-NEXT: v_mov_b32_e32 v0, s2
1863 ; CI-NEXT: v_mov_b32_e32 v1, s3
1864 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1865 ; CI-NEXT: s_add_u32 s2, s0, 48
1866 ; CI-NEXT: s_addc_u32 s3, s1, 0
1867 ; CI-NEXT: v_mov_b32_e32 v14, s3
1868 ; CI-NEXT: v_mov_b32_e32 v13, s2
1869 ; CI-NEXT: s_add_u32 s2, s0, 32
1870 ; CI-NEXT: s_addc_u32 s3, s1, 0
1871 ; CI-NEXT: v_mov_b32_e32 v16, s3
1872 ; CI-NEXT: v_mov_b32_e32 v15, s2
1873 ; CI-NEXT: s_add_u32 s2, s0, 16
1874 ; CI-NEXT: s_addc_u32 s3, s1, 0
1875 ; CI-NEXT: v_mov_b32_e32 v18, s3
1876 ; CI-NEXT: v_mov_b32_e32 v17, s2
1877 ; CI-NEXT: s_add_u32 s2, s0, 0x70
1878 ; CI-NEXT: s_addc_u32 s3, s1, 0
1879 ; CI-NEXT: v_mov_b32_e32 v12, s1
1880 ; CI-NEXT: v_mov_b32_e32 v11, s0
1881 ; CI-NEXT: s_waitcnt vmcnt(1)
1882 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v7
1883 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
1884 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v8
1885 ; CI-NEXT: v_lshrrev_b32_e32 v19, 16, v6
1886 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
1887 ; CI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
1888 ; CI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
1889 ; CI-NEXT: s_waitcnt vmcnt(0)
1890 ; CI-NEXT: v_cvt_f32_f16_e32 v21, v0
1891 ; CI-NEXT: flat_store_dwordx4 v[13:14], v[7:10]
1893 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v19
1894 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
1895 ; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v3
1896 ; CI-NEXT: v_mov_b32_e32 v14, s3
1897 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
1898 ; CI-NEXT: v_mov_b32_e32 v13, s2
1899 ; CI-NEXT: s_add_u32 s2, s0, 0x60
1900 ; CI-NEXT: s_addc_u32 s3, s1, 0
1901 ; CI-NEXT: flat_store_dwordx4 v[15:16], v[6:9]
1902 ; CI-NEXT: v_mov_b32_e32 v16, s3
1903 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
1904 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
1905 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
1906 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4
1907 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v4
1908 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
1909 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
1910 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
1911 ; CI-NEXT: v_mov_b32_e32 v15, s2
1912 ; CI-NEXT: s_add_u32 s2, s0, 0x50
1913 ; CI-NEXT: flat_store_dwordx4 v[17:18], v[4:7]
1914 ; CI-NEXT: v_cvt_f32_f16_e32 v17, v1
1915 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v3
1916 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
1917 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v2
1918 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
1919 ; CI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
1920 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
1921 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
1922 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v10
1923 ; CI-NEXT: v_cvt_f32_f16_e32 v10, v5
1924 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
1925 ; CI-NEXT: flat_store_dwordx4 v[11:12], v[0:3]
1926 ; CI-NEXT: v_cvt_f32_f16_e32 v12, v18
1927 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v4
1928 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
1929 ; CI-NEXT: s_addc_u32 s3, s1, 0
1930 ; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
1931 ; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
1932 ; CI-NEXT: s_add_u32 s0, s0, 64
1933 ; CI-NEXT: flat_store_dwordx4 v[13:14], v[0:3]
1934 ; CI-NEXT: s_addc_u32 s1, s1, 0
1935 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v17
1936 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7
1937 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v21
1938 ; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12
1939 ; CI-NEXT: v_mov_b32_e32 v20, s3
1940 ; CI-NEXT: v_mov_b32_e32 v13, s1
1941 ; CI-NEXT: v_mov_b32_e32 v19, s2
1942 ; CI-NEXT: v_mov_b32_e32 v12, s0
1943 ; CI-NEXT: flat_store_dwordx4 v[15:16], v[8:11]
1944 ; CI-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
1945 ; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
1948 ; VI-LABEL: global_extload_v16f16_to_v16f64:
1950 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
1951 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1952 ; VI-NEXT: v_mov_b32_e32 v0, s2
1953 ; VI-NEXT: v_mov_b32_e32 v1, s3
1954 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
1955 ; VI-NEXT: s_add_u32 s2, s2, 16
1956 ; VI-NEXT: s_addc_u32 s3, s3, 0
1957 ; VI-NEXT: v_mov_b32_e32 v0, s2
1958 ; VI-NEXT: v_mov_b32_e32 v1, s3
1959 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1960 ; VI-NEXT: s_add_u32 s2, s0, 48
1961 ; VI-NEXT: s_addc_u32 s3, s1, 0
1962 ; VI-NEXT: v_mov_b32_e32 v9, s3
1963 ; VI-NEXT: v_mov_b32_e32 v8, s2
1964 ; VI-NEXT: s_add_u32 s2, s0, 32
1965 ; VI-NEXT: s_addc_u32 s3, s1, 0
1966 ; VI-NEXT: v_mov_b32_e32 v13, s3
1967 ; VI-NEXT: v_mov_b32_e32 v12, s2
1968 ; VI-NEXT: s_add_u32 s2, s0, 16
1969 ; VI-NEXT: s_addc_u32 s3, s1, 0
1970 ; VI-NEXT: v_mov_b32_e32 v15, s3
1971 ; VI-NEXT: v_mov_b32_e32 v14, s2
1972 ; VI-NEXT: s_add_u32 s2, s0, 0x50
1973 ; VI-NEXT: s_addc_u32 s3, s1, 0
1974 ; VI-NEXT: v_mov_b32_e32 v17, s3
1975 ; VI-NEXT: v_mov_b32_e32 v16, s2
1976 ; VI-NEXT: s_add_u32 s2, s0, 64
1977 ; VI-NEXT: s_addc_u32 s3, s1, 0
1978 ; VI-NEXT: v_mov_b32_e32 v19, s3
1979 ; VI-NEXT: v_mov_b32_e32 v11, s1
1980 ; VI-NEXT: v_mov_b32_e32 v18, s2
1981 ; VI-NEXT: s_add_u32 s2, s0, 0x70
1982 ; VI-NEXT: v_mov_b32_e32 v10, s0
1983 ; VI-NEXT: s_addc_u32 s3, s1, 0
1984 ; VI-NEXT: s_add_u32 s0, s0, 0x60
1985 ; VI-NEXT: s_addc_u32 s1, s1, 0
1986 ; VI-NEXT: s_waitcnt vmcnt(1)
1987 ; VI-NEXT: v_cvt_f32_f16_e32 v22, v4
1988 ; VI-NEXT: v_cvt_f32_f16_sdwa v23, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1989 ; VI-NEXT: v_cvt_f32_f16_e32 v4, v7
1990 ; VI-NEXT: v_cvt_f32_f16_sdwa v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1991 ; VI-NEXT: v_cvt_f32_f16_e32 v24, v5
1992 ; VI-NEXT: v_cvt_f32_f16_sdwa v25, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1993 ; VI-NEXT: v_cvt_f32_f16_e32 v20, v6
1994 ; VI-NEXT: v_cvt_f32_f16_sdwa v21, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1995 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
1996 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
1997 ; VI-NEXT: s_waitcnt vmcnt(0)
1998 ; VI-NEXT: v_cvt_f32_f16_e32 v26, v2
1999 ; VI-NEXT: v_cvt_f32_f16_sdwa v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2000 ; VI-NEXT: v_cvt_f32_f16_sdwa v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2001 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
2002 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v3
2003 ; VI-NEXT: v_cvt_f32_f16_e32 v29, v0
2004 ; VI-NEXT: v_cvt_f32_f16_sdwa v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2005 ; VI-NEXT: v_cvt_f32_f16_e32 v31, v1
2006 ; VI-NEXT: v_cvt_f32_f16_sdwa v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
2007 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v20
2008 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v21
2009 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v22
2010 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v23
2011 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
2012 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
2013 ; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v31
2014 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v24
2015 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v25
2016 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
2017 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v28
2018 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v29
2019 ; VI-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
2020 ; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v32
2021 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v30
2022 ; VI-NEXT: v_mov_b32_e32 v21, s3
2023 ; VI-NEXT: v_mov_b32_e32 v23, s1
2024 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v26
2025 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v27
2026 ; VI-NEXT: v_mov_b32_e32 v20, s2
2027 ; VI-NEXT: v_mov_b32_e32 v22, s0
2028 ; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
2029 ; VI-NEXT: flat_store_dwordx4 v[18:19], v[4:7]
2030 ; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11]
2031 ; VI-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
2034 ; GFX11-LABEL: global_extload_v16f16_to_v16f64:
2036 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2037 ; GFX11-NEXT: v_mov_b32_e32 v32, 0
2038 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2039 ; GFX11-NEXT: s_clause 0x1
2040 ; GFX11-NEXT: global_load_b128 v[0:3], v32, s[2:3]
2041 ; GFX11-NEXT: global_load_b128 v[4:7], v32, s[2:3] offset:16
2042 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2043 ; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v1
2044 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2045 ; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v5
2046 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1
2047 ; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v4
2048 ; GFX11-NEXT: v_cvt_f32_f16_e32 v15, v7
2049 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7
2050 ; GFX11-NEXT: v_cvt_f32_f16_e32 v14, v6
2051 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6
2052 ; GFX11-NEXT: v_cvt_f32_f16_e32 v13, v3
2053 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2054 ; GFX11-NEXT: v_cvt_f32_f16_e32 v12, v2
2055 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2056 ; GFX11-NEXT: v_cvt_f32_f16_e32 v18, v4
2057 ; GFX11-NEXT: v_cvt_f32_f16_e32 v22, v5
2058 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
2059 ; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v23
2060 ; GFX11-NEXT: v_cvt_f32_f16_e32 v34, v11
2061 ; GFX11-NEXT: v_cvt_f32_f16_e32 v11, v19
2062 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0
2063 ; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v7
2064 ; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v6
2065 ; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v0
2066 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3
2067 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2
2068 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v22
2069 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v10
2070 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v18
2071 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v11
2072 ; GFX11-NEXT: v_cvt_f32_f16_e32 v33, v9
2073 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v15
2074 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[22:23], v7
2075 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[16:17], v14
2076 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[18:19], v6
2077 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v8
2078 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v12
2079 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v13
2080 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v3
2081 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
2082 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v34
2083 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v33
2084 ; GFX11-NEXT: s_clause 0x7
2085 ; GFX11-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:80
2086 ; GFX11-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:64
2087 ; GFX11-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:112
2088 ; GFX11-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:96
2089 ; GFX11-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48
2090 ; GFX11-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
2091 ; GFX11-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
2092 ; GFX11-NEXT: global_store_b128 v32, v[0:3], s[0:1]
2093 ; GFX11-NEXT: s_nop 0
2094 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2095 ; GFX11-NEXT: s_endpgm
2096 %val = load <16 x half>, ptr addrspace(1) %in
2097 %cvt = fpext <16 x half> %val to <16 x double>
2098 store <16 x double> %cvt, ptr addrspace(1) %out
; Loads a float from %in, truncates it to half with fptrunc, and stores the
; result to %out. The expected code is a single v_cvt_f16_f32 followed by a
; 16-bit store. The CI and VI runs share one set of assertions under the common
; CIVI prefix; GFX11 has its own. The assertions are autogenerated (see the
; NOTE at the top of the file).
2102 define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2103 ; CIVI-LABEL: global_truncstore_f32_to_f16:
2105 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2106 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
2107 ; CIVI-NEXT: v_mov_b32_e32 v0, s2
2108 ; CIVI-NEXT: v_mov_b32_e32 v1, s3
2109 ; CIVI-NEXT: flat_load_dword v0, v[0:1]
2110 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
2111 ; CIVI-NEXT: s_waitcnt vmcnt(0)
2112 ; CIVI-NEXT: v_cvt_f16_f32_e32 v2, v0
2113 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
2114 ; CIVI-NEXT: flat_store_short v[0:1], v2
2115 ; CIVI-NEXT: s_endpgm
2117 ; GFX11-LABEL: global_truncstore_f32_to_f16:
2119 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2120 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2121 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2122 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
2123 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2124 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
2125 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
2126 ; GFX11-NEXT: s_nop 0
2127 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2128 ; GFX11-NEXT: s_endpgm
2129 %val = load float, ptr addrspace(1) %in
2130 %cvt = fptrunc float %val to half
2131 store half %cvt, ptr addrspace(1) %out
; Loads <2 x float> from %in, truncates it to <2 x half> with fptrunc, and
; stores the result to %out. Both halves are expected to be packed into one
; 32-bit register and written with a single dword store. The packing differs
; per target: the CI checks use shift-left-by-16 plus OR, the VI checks use an
; SDWA conversion writing WORD_1 plus OR, and the GFX11 checks use
; v_pack_b32_f16. The assertions are autogenerated (see the NOTE at the top of
; the file).
2135 define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2136 ; CI-LABEL: global_truncstore_v2f32_to_v2f16:
2138 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2139 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2140 ; CI-NEXT: v_mov_b32_e32 v0, s2
2141 ; CI-NEXT: v_mov_b32_e32 v1, s3
2142 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2143 ; CI-NEXT: s_waitcnt vmcnt(0)
2144 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v1
2145 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v0
2146 ; CI-NEXT: v_mov_b32_e32 v0, s0
2147 ; CI-NEXT: v_mov_b32_e32 v1, s1
2148 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2149 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
2150 ; CI-NEXT: flat_store_dword v[0:1], v2
2153 ; VI-LABEL: global_truncstore_v2f32_to_v2f16:
2155 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2156 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2157 ; VI-NEXT: v_mov_b32_e32 v0, s2
2158 ; VI-NEXT: v_mov_b32_e32 v1, s3
2159 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2160 ; VI-NEXT: s_waitcnt vmcnt(0)
2161 ; VI-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2162 ; VI-NEXT: v_cvt_f16_f32_e32 v3, v0
2163 ; VI-NEXT: v_mov_b32_e32 v0, s0
2164 ; VI-NEXT: v_mov_b32_e32 v1, s1
2165 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
2166 ; VI-NEXT: flat_store_dword v[0:1], v2
2169 ; GFX11-LABEL: global_truncstore_v2f32_to_v2f16:
2171 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2172 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
2173 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2174 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
2175 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2176 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
2177 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
2178 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2179 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
2180 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
2181 ; GFX11-NEXT: s_nop 0
2182 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2183 ; GFX11-NEXT: s_endpgm
2184 %val = load <2 x float>, ptr addrspace(1) %in
2185 %cvt = fptrunc <2 x float> %val to <2 x half>
2186 store <2 x half> %cvt, ptr addrspace(1) %out
; Loads <3 x float> from %in, truncates it to <3 x half> with fptrunc, and
; stores the result to %out. The odd element count is expected to split the
; store in two: elements 0 and 1 are packed into one dword written at offset 0,
; and element 2 is written separately as a 16-bit store at offset 4. The
; assertions are autogenerated (see the NOTE at the top of the file).
2190 define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2191 ; CI-LABEL: global_truncstore_v3f32_to_v3f16:
2193 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2194 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2195 ; CI-NEXT: v_mov_b32_e32 v0, s2
2196 ; CI-NEXT: v_mov_b32_e32 v1, s3
2197 ; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
2198 ; CI-NEXT: s_add_u32 s2, s0, 4
2199 ; CI-NEXT: s_addc_u32 s3, s1, 0
2200 ; CI-NEXT: s_waitcnt vmcnt(0)
2201 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v1
2202 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2203 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v0
2204 ; CI-NEXT: v_mov_b32_e32 v0, s2
2205 ; CI-NEXT: v_mov_b32_e32 v1, s3
2206 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2207 ; CI-NEXT: flat_store_short v[0:1], v2
2208 ; CI-NEXT: v_mov_b32_e32 v0, s0
2209 ; CI-NEXT: v_or_b32_e32 v2, v4, v3
2210 ; CI-NEXT: v_mov_b32_e32 v1, s1
2211 ; CI-NEXT: flat_store_dword v[0:1], v2
2214 ; VI-LABEL: global_truncstore_v3f32_to_v3f16:
2216 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2217 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2218 ; VI-NEXT: v_mov_b32_e32 v0, s2
2219 ; VI-NEXT: v_mov_b32_e32 v1, s3
2220 ; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
2221 ; VI-NEXT: s_add_u32 s2, s0, 4
2222 ; VI-NEXT: s_addc_u32 s3, s1, 0
2223 ; VI-NEXT: s_waitcnt vmcnt(0)
2224 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
2225 ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2226 ; VI-NEXT: v_cvt_f16_f32_e32 v4, v0
2227 ; VI-NEXT: v_mov_b32_e32 v0, s2
2228 ; VI-NEXT: v_mov_b32_e32 v1, s3
2229 ; VI-NEXT: flat_store_short v[0:1], v2
2230 ; VI-NEXT: v_mov_b32_e32 v0, s0
2231 ; VI-NEXT: v_or_b32_e32 v3, v4, v3
2232 ; VI-NEXT: v_mov_b32_e32 v1, s1
2233 ; VI-NEXT: flat_store_dword v[0:1], v3
2236 ; GFX11-LABEL: global_truncstore_v3f32_to_v3f16:
2238 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2239 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
2240 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2241 ; GFX11-NEXT: global_load_b96 v[0:2], v3, s[2:3]
2242 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2243 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
2244 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
2245 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
2246 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2247 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
2248 ; GFX11-NEXT: s_clause 0x1
2249 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4
2250 ; GFX11-NEXT: global_store_b32 v3, v0, s[0:1]
2251 ; GFX11-NEXT: s_nop 0
2252 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2253 ; GFX11-NEXT: s_endpgm
2254 %val = load <3 x float>, ptr addrspace(1) %in
2255 %cvt = fptrunc <3 x float> %val to <3 x half>
2256 store <3 x half> %cvt, ptr addrspace(1) %out
; Loads <4 x float> from %in, truncates it to <4 x half> with fptrunc, and
; stores the result to %out. The four converted halves are expected to be
; packed pairwise into two 32-bit registers and written with a single 64-bit
; store. The assertions are autogenerated (see the NOTE at the top of the
; file).
2260 define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2261 ; CI-LABEL: global_truncstore_v4f32_to_v4f16:
2263 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2264 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2265 ; CI-NEXT: v_mov_b32_e32 v0, s2
2266 ; CI-NEXT: v_mov_b32_e32 v1, s3
2267 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2268 ; CI-NEXT: v_mov_b32_e32 v4, s0
2269 ; CI-NEXT: v_mov_b32_e32 v5, s1
2270 ; CI-NEXT: s_waitcnt vmcnt(0)
2271 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
2272 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2273 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2274 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2275 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2276 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v1
2277 ; CI-NEXT: v_or_b32_e32 v1, v2, v3
2278 ; CI-NEXT: v_or_b32_e32 v0, v0, v6
2279 ; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
2282 ; VI-LABEL: global_truncstore_v4f32_to_v4f16:
2284 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2285 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2286 ; VI-NEXT: v_mov_b32_e32 v0, s2
2287 ; VI-NEXT: v_mov_b32_e32 v1, s3
2288 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2289 ; VI-NEXT: s_waitcnt vmcnt(0)
2290 ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2291 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
2292 ; VI-NEXT: v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2293 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v0
2294 ; VI-NEXT: v_mov_b32_e32 v0, s0
2295 ; VI-NEXT: v_mov_b32_e32 v1, s1
2296 ; VI-NEXT: v_or_b32_e32 v3, v2, v3
2297 ; VI-NEXT: v_or_b32_e32 v2, v5, v4
2298 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
2301 ; GFX11-LABEL: global_truncstore_v4f32_to_v4f16:
2303 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2304 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
2305 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2306 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
2307 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2308 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
2309 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
2310 ; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v1
2311 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
2312 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2313 ; GFX11-NEXT: v_pack_b32_f16 v1, v2, v3
2314 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v5
2315 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
2316 ; GFX11-NEXT: s_nop 0
2317 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2318 ; GFX11-NEXT: s_endpgm
2319 %val = load <4 x float>, ptr addrspace(1) %in
2320 %cvt = fptrunc <4 x float> %val to <4 x half>
2321 store <4 x half> %cvt, ptr addrspace(1) %out
; Loads <8 x float> from %in, truncates it to <8 x half> with fptrunc, and
; stores the result to %out. The input is expected to be read with two 128-bit
; loads; the eight converted halves are packed pairwise into four 32-bit
; registers and written with a single 128-bit store. Conversion of the first
; load's data starts under vmcnt(1), before the second load has completed. The
; assertions are autogenerated (see the NOTE at the top of the file).
2325 define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2326 ; CI-LABEL: global_truncstore_v8f32_to_v8f16:
2328 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2329 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2330 ; CI-NEXT: v_mov_b32_e32 v0, s2
2331 ; CI-NEXT: v_mov_b32_e32 v1, s3
2332 ; CI-NEXT: s_add_u32 s2, s2, 16
2333 ; CI-NEXT: s_addc_u32 s3, s3, 0
2334 ; CI-NEXT: v_mov_b32_e32 v5, s3
2335 ; CI-NEXT: v_mov_b32_e32 v4, s2
2336 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2337 ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2338 ; CI-NEXT: v_mov_b32_e32 v8, s0
2339 ; CI-NEXT: v_mov_b32_e32 v9, s1
2340 ; CI-NEXT: s_waitcnt vmcnt(1)
2341 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
2342 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2343 ; CI-NEXT: s_waitcnt vmcnt(0)
2344 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
2345 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
2346 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2347 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2348 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
2349 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
2350 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2351 ; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v1
2352 ; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
2353 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
2354 ; CI-NEXT: v_or_b32_e32 v1, v2, v3
2355 ; CI-NEXT: v_or_b32_e32 v0, v0, v10
2356 ; CI-NEXT: v_or_b32_e32 v3, v6, v7
2357 ; CI-NEXT: v_or_b32_e32 v2, v4, v5
2358 ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
2361 ; VI-LABEL: global_truncstore_v8f32_to_v8f16:
2363 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2364 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2365 ; VI-NEXT: v_mov_b32_e32 v0, s2
2366 ; VI-NEXT: v_mov_b32_e32 v1, s3
2367 ; VI-NEXT: s_add_u32 s2, s2, 16
2368 ; VI-NEXT: s_addc_u32 s3, s3, 0
2369 ; VI-NEXT: v_mov_b32_e32 v5, s3
2370 ; VI-NEXT: v_mov_b32_e32 v4, s2
2371 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2372 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2373 ; VI-NEXT: v_mov_b32_e32 v8, s0
2374 ; VI-NEXT: v_mov_b32_e32 v9, s1
2375 ; VI-NEXT: s_waitcnt vmcnt(1)
2376 ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2377 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
2378 ; VI-NEXT: v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2379 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
2380 ; VI-NEXT: s_waitcnt vmcnt(0)
2381 ; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2382 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
2383 ; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2384 ; VI-NEXT: v_cvt_f16_f32_e32 v4, v4
2385 ; VI-NEXT: v_or_b32_e32 v1, v2, v3
2386 ; VI-NEXT: v_or_b32_e32 v0, v0, v10
2387 ; VI-NEXT: v_or_b32_e32 v3, v6, v7
2388 ; VI-NEXT: v_or_b32_e32 v2, v4, v5
2389 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
2392 ; GFX11-LABEL: global_truncstore_v8f32_to_v8f16:
2394 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2395 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
2396 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2397 ; GFX11-NEXT: s_clause 0x1
2398 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16
2399 ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3]
2400 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2401 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
2402 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
2403 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
2404 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
2405 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2406 ; GFX11-NEXT: v_cvt_f16_f32_e32 v7, v7
2407 ; GFX11-NEXT: v_cvt_f16_f32_e32 v6, v6
2408 ; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5
2409 ; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
2410 ; GFX11-NEXT: v_pack_b32_f16 v3, v2, v3
2411 ; GFX11-NEXT: v_pack_b32_f16 v2, v0, v1
2412 ; GFX11-NEXT: v_pack_b32_f16 v1, v6, v7
2413 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
2414 ; GFX11-NEXT: v_pack_b32_f16 v0, v4, v5
2415 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
2416 ; GFX11-NEXT: s_nop 0
2417 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2418 ; GFX11-NEXT: s_endpgm
2419 %val = load <8 x float>, ptr addrspace(1) %in
2420 %cvt = fptrunc <8 x float> %val to <8 x half>
2421 store <8 x half> %cvt, ptr addrspace(1) %out
2425 define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2426 ; CI-LABEL: global_truncstore_v16f32_to_v16f16:
2428 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2429 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2430 ; CI-NEXT: s_add_u32 s4, s2, 32
2431 ; CI-NEXT: s_addc_u32 s5, s3, 0
2432 ; CI-NEXT: v_mov_b32_e32 v0, s4
2433 ; CI-NEXT: v_mov_b32_e32 v1, s5
2434 ; CI-NEXT: s_add_u32 s4, s2, 48
2435 ; CI-NEXT: s_addc_u32 s5, s3, 0
2436 ; CI-NEXT: v_mov_b32_e32 v9, s3
2437 ; CI-NEXT: v_mov_b32_e32 v4, s4
2438 ; CI-NEXT: v_mov_b32_e32 v8, s2
2439 ; CI-NEXT: s_add_u32 s2, s2, 16
2440 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2441 ; CI-NEXT: v_mov_b32_e32 v5, s5
2442 ; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2443 ; CI-NEXT: s_addc_u32 s3, s3, 0
2444 ; CI-NEXT: v_mov_b32_e32 v13, s3
2445 ; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
2446 ; CI-NEXT: v_mov_b32_e32 v12, s2
2447 ; CI-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
2448 ; CI-NEXT: s_add_u32 s2, s0, 16
2449 ; CI-NEXT: s_addc_u32 s3, s1, 0
2450 ; CI-NEXT: v_mov_b32_e32 v17, s3
2451 ; CI-NEXT: v_mov_b32_e32 v16, s2
2452 ; CI-NEXT: s_waitcnt vmcnt(3)
2453 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
2454 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2455 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2456 ; CI-NEXT: s_waitcnt vmcnt(2)
2457 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
2458 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
2459 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2460 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
2461 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
2462 ; CI-NEXT: s_waitcnt vmcnt(1)
2463 ; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
2464 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
2465 ; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
2466 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
2467 ; CI-NEXT: s_waitcnt vmcnt(0)
2468 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
2469 ; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
2470 ; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
2471 ; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
2472 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2473 ; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
2474 ; CI-NEXT: v_or_b32_e32 v1, v2, v3
2475 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
2476 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
2477 ; CI-NEXT: v_or_b32_e32 v0, v0, v18
2478 ; CI-NEXT: v_or_b32_e32 v3, v6, v2
2479 ; CI-NEXT: v_or_b32_e32 v2, v4, v5
2480 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v11
2481 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v9
2482 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v15
2483 ; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v13
2484 ; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
2486 ; CI-NEXT: v_or_b32_e32 v1, v10, v4
2487 ; CI-NEXT: v_or_b32_e32 v0, v8, v5
2488 ; CI-NEXT: v_mov_b32_e32 v5, s1
2489 ; CI-NEXT: v_or_b32_e32 v3, v14, v6
2490 ; CI-NEXT: v_or_b32_e32 v2, v12, v7
2491 ; CI-NEXT: v_mov_b32_e32 v4, s0
2492 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2495 ; VI-LABEL: global_truncstore_v16f32_to_v16f16:
2497 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2498 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2499 ; VI-NEXT: s_add_u32 s4, s2, 32
2500 ; VI-NEXT: s_addc_u32 s5, s3, 0
2501 ; VI-NEXT: v_mov_b32_e32 v0, s4
2502 ; VI-NEXT: v_mov_b32_e32 v1, s5
2503 ; VI-NEXT: s_add_u32 s4, s2, 48
2504 ; VI-NEXT: s_addc_u32 s5, s3, 0
2505 ; VI-NEXT: v_mov_b32_e32 v9, s3
2506 ; VI-NEXT: v_mov_b32_e32 v8, s2
2507 ; VI-NEXT: s_add_u32 s2, s2, 16
2508 ; VI-NEXT: v_mov_b32_e32 v4, s4
2509 ; VI-NEXT: s_addc_u32 s3, s3, 0
2510 ; VI-NEXT: v_mov_b32_e32 v5, s5
2511 ; VI-NEXT: v_mov_b32_e32 v13, s3
2512 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2513 ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
2514 ; VI-NEXT: v_mov_b32_e32 v12, s2
2515 ; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
2516 ; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
2517 ; VI-NEXT: s_add_u32 s2, s0, 16
2518 ; VI-NEXT: s_addc_u32 s3, s1, 0
2519 ; VI-NEXT: v_mov_b32_e32 v17, s3
2520 ; VI-NEXT: v_mov_b32_e32 v16, s2
2521 ; VI-NEXT: s_waitcnt vmcnt(3)
2522 ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2523 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
2524 ; VI-NEXT: v_cvt_f16_f32_sdwa v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2525 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
2526 ; VI-NEXT: s_waitcnt vmcnt(2)
2527 ; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2528 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
2529 ; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2530 ; VI-NEXT: v_cvt_f16_f32_e32 v4, v4
2531 ; VI-NEXT: s_waitcnt vmcnt(1)
2532 ; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2533 ; VI-NEXT: v_cvt_f16_f32_e32 v10, v10
2534 ; VI-NEXT: v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2535 ; VI-NEXT: v_cvt_f16_f32_e32 v8, v8
2536 ; VI-NEXT: s_waitcnt vmcnt(0)
2537 ; VI-NEXT: v_cvt_f16_f32_sdwa v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2538 ; VI-NEXT: v_cvt_f16_f32_e32 v14, v14
2539 ; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2540 ; VI-NEXT: v_cvt_f16_f32_e32 v12, v12
2541 ; VI-NEXT: v_or_b32_e32 v1, v2, v3
2542 ; VI-NEXT: v_or_b32_e32 v0, v0, v18
2543 ; VI-NEXT: v_or_b32_e32 v3, v6, v7
2544 ; VI-NEXT: v_or_b32_e32 v2, v4, v5
2545 ; VI-NEXT: v_mov_b32_e32 v5, s1
2546 ; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
2547 ; VI-NEXT: v_mov_b32_e32 v4, s0
2548 ; VI-NEXT: v_or_b32_e32 v1, v10, v11
2549 ; VI-NEXT: v_or_b32_e32 v0, v8, v9
2550 ; VI-NEXT: v_or_b32_e32 v3, v14, v15
2551 ; VI-NEXT: v_or_b32_e32 v2, v12, v13
2552 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2555 ; GFX11-LABEL: global_truncstore_v16f32_to_v16f16:
2557 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2558 ; GFX11-NEXT: v_mov_b32_e32 v16, 0
2559 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2560 ; GFX11-NEXT: s_clause 0x3
2561 ; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] offset:16
2562 ; GFX11-NEXT: global_load_b128 v[4:7], v16, s[2:3]
2563 ; GFX11-NEXT: global_load_b128 v[8:11], v16, s[2:3] offset:48
2564 ; GFX11-NEXT: global_load_b128 v[12:15], v16, s[2:3] offset:32
2565 ; GFX11-NEXT: s_waitcnt vmcnt(3)
2566 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
2567 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
2568 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
2569 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
2570 ; GFX11-NEXT: s_waitcnt vmcnt(2)
2571 ; GFX11-NEXT: v_cvt_f16_f32_e32 v7, v7
2572 ; GFX11-NEXT: v_cvt_f16_f32_e32 v6, v6
2573 ; GFX11-NEXT: v_cvt_f16_f32_e32 v17, v5
2574 ; GFX11-NEXT: v_cvt_f16_f32_e32 v18, v4
2575 ; GFX11-NEXT: s_waitcnt vmcnt(1)
2576 ; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v11
2577 ; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v10
2578 ; GFX11-NEXT: v_cvt_f16_f32_e32 v9, v9
2579 ; GFX11-NEXT: v_cvt_f16_f32_e32 v8, v8
2580 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2581 ; GFX11-NEXT: v_cvt_f16_f32_e32 v10, v15
2582 ; GFX11-NEXT: v_cvt_f16_f32_e32 v11, v14
2583 ; GFX11-NEXT: v_cvt_f16_f32_e32 v13, v13
2584 ; GFX11-NEXT: v_cvt_f16_f32_e32 v12, v12
2585 ; GFX11-NEXT: v_pack_b32_f16 v3, v2, v3
2586 ; GFX11-NEXT: v_pack_b32_f16 v2, v0, v1
2587 ; GFX11-NEXT: v_pack_b32_f16 v1, v6, v7
2588 ; GFX11-NEXT: v_pack_b32_f16 v7, v5, v4
2589 ; GFX11-NEXT: v_pack_b32_f16 v6, v8, v9
2590 ; GFX11-NEXT: v_pack_b32_f16 v5, v11, v10
2591 ; GFX11-NEXT: v_pack_b32_f16 v4, v12, v13
2592 ; GFX11-NEXT: v_pack_b32_f16 v0, v18, v17
2593 ; GFX11-NEXT: s_clause 0x1
2594 ; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
2595 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1]
2596 ; GFX11-NEXT: s_nop 0
2597 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2598 ; GFX11-NEXT: s_endpgm
2599 %val = load <16 x float>, ptr addrspace(1) %in
2600 %cvt = fptrunc <16 x float> %val to <16 x half>
2601 store <16 x half> %cvt, ptr addrspace(1) %out
2605 ; FIXME: Unsafe math should fold conversions away
; Scalar half add: both operands are by-value kernel arguments and the sum is
; stored to %out. Expected code per run line, as pinned by the checks below
;   - kaveri  -- each operand is widened with v_cvt_f32_f16, the add is done
;                with v_add_f32, and the result is narrowed with v_cvt_f16_f32.
;   - tonga   -- a single v_add_f16_e32; the second operand is shifted out of
;                the upper 16 bits of the argument dword and copied to a VGPR.
;   - gfx1100 -- a single v_add_f16_e64 reading both operands from SGPRs.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2606 define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 {
2607 ; CI-LABEL: fadd_f16:
2609 ; CI-NEXT: s_load_dword s0, s[6:7], 0x2
2610 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2611 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
2612 ; CI-NEXT: s_lshr_b32 s0, s0, 16
2613 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
2614 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
2615 ; CI-NEXT: v_add_f32_e32 v0, v0, v1
2616 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0
2617 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2618 ; CI-NEXT: v_mov_b32_e32 v0, s0
2619 ; CI-NEXT: v_mov_b32_e32 v1, s1
2620 ; CI-NEXT: flat_store_short v[0:1], v2
2623 ; VI-LABEL: fadd_f16:
2625 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
2626 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
2627 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2628 ; VI-NEXT: s_lshr_b32 s3, s2, 16
2629 ; VI-NEXT: v_mov_b32_e32 v0, s3
2630 ; VI-NEXT: v_add_f16_e32 v2, s2, v0
2631 ; VI-NEXT: v_mov_b32_e32 v0, s0
2632 ; VI-NEXT: v_mov_b32_e32 v1, s1
2633 ; VI-NEXT: flat_store_short v[0:1], v2
2636 ; GFX11-LABEL: fadd_f16:
2638 ; GFX11-NEXT: s_clause 0x1
2639 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
2640 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
2641 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2642 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2643 ; GFX11-NEXT: s_lshr_b32 s2, s4, 16
2644 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2645 ; GFX11-NEXT: v_add_f16_e64 v1, s4, s2
2646 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
2647 ; GFX11-NEXT: s_nop 0
2648 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2649 ; GFX11-NEXT: s_endpgm
2650 %add = fadd half %a, %b
2651 store half %add, ptr addrspace(1) %out, align 4
; <2 x half> add of two by-value kernel arguments, stored to %out.
; Expected code per run line, as pinned by the checks below
;   - kaveri  -- all four halves are widened with v_cvt_f32_f16, two v_add_f32
;                are issued, and the narrowed results are packed into one
;                dword with v_lshlrev_b32 + v_or_b32.
;   - tonga   -- the low lane uses v_add_f16_e32 and the high lane uses
;                v_add_f16_sdwa writing WORD_1; a v_or_b32 merges the lanes.
;   - gfx1100 -- one packed v_pk_add_f16 handles both lanes.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2655 define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x half> %b) #0 {
2656 ; CI-LABEL: fadd_v2f16:
2658 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2659 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2660 ; CI-NEXT: s_lshr_b32 s4, s2, 16
2661 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
2662 ; CI-NEXT: s_lshr_b32 s2, s3, 16
2663 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
2664 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
2665 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s2
2666 ; CI-NEXT: v_add_f32_e32 v0, v0, v1
2667 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2668 ; CI-NEXT: v_add_f32_e32 v1, v2, v3
2669 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2670 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2671 ; CI-NEXT: v_or_b32_e32 v2, v0, v1
2672 ; CI-NEXT: v_mov_b32_e32 v0, s0
2673 ; CI-NEXT: v_mov_b32_e32 v1, s1
2674 ; CI-NEXT: flat_store_dword v[0:1], v2
2677 ; VI-LABEL: fadd_v2f16:
2679 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2680 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2681 ; VI-NEXT: s_lshr_b32 s4, s3, 16
2682 ; VI-NEXT: s_lshr_b32 s5, s2, 16
2683 ; VI-NEXT: v_mov_b32_e32 v0, s3
2684 ; VI-NEXT: v_mov_b32_e32 v1, s4
2685 ; VI-NEXT: v_mov_b32_e32 v2, s5
2686 ; VI-NEXT: v_add_f16_e32 v0, s2, v0
2687 ; VI-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2688 ; VI-NEXT: v_or_b32_e32 v2, v0, v1
2689 ; VI-NEXT: v_mov_b32_e32 v0, s0
2690 ; VI-NEXT: v_mov_b32_e32 v1, s1
2691 ; VI-NEXT: flat_store_dword v[0:1], v2
2694 ; GFX11-LABEL: fadd_v2f16:
2696 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2697 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2698 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2699 ; GFX11-NEXT: v_pk_add_f16 v1, s2, s3
2700 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2701 ; GFX11-NEXT: s_nop 0
2702 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2703 ; GFX11-NEXT: s_endpgm
2704 %add = fadd <2 x half> %a, %b
2705 store <2 x half> %add, ptr addrspace(1) %out, align 8
; <4 x half> add where both operands come from memory: %a is loaded from %in
; and %b from the next <4 x half> element after it (getelementptr index 1).
; On every run line the checks show the two loads covered by one 128-bit load
; (flat_load_dwordx4 / global_load_b128) from %in.
; Expected arithmetic per run line, as pinned by the checks below
;   - kaveri  -- eight v_cvt_f32_f16, four v_add_f32, four v_cvt_f16_f32, then
;                v_lshlrev_b32 + v_or_b32 to pack the result dwords.
;   - tonga   -- per result dword, one v_add_f16_e32 (low lane) and one
;                v_add_f16_sdwa (WORD_1 lanes), merged with v_or_b32.
;   - gfx1100 -- two v_pk_add_f16.
; NOTE(review) -- %b_ptr is one <4 x half> (8 bytes) past %in, yet both loads
; are marked align 16; both cannot hold at once. Looks unintentional, but the
; generated checks depend on it -- confirm before changing and regenerate the
; checks with utils/update_llc_test_checks.py if it is fixed.
2709 define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2710 ; CI-LABEL: fadd_v4f16:
2712 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2713 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2714 ; CI-NEXT: v_mov_b32_e32 v0, s2
2715 ; CI-NEXT: v_mov_b32_e32 v1, s3
2716 ; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2717 ; CI-NEXT: v_mov_b32_e32 v4, s0
2718 ; CI-NEXT: v_mov_b32_e32 v5, s1
2719 ; CI-NEXT: s_waitcnt vmcnt(0)
2720 ; CI-NEXT: v_cvt_f32_f16_e32 v6, v0
2721 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
2722 ; CI-NEXT: v_cvt_f32_f16_e32 v7, v1
2723 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
2724 ; CI-NEXT: v_cvt_f32_f16_e32 v8, v2
2725 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
2726 ; CI-NEXT: v_cvt_f32_f16_e32 v9, v3
2727 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
2728 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
2729 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
2730 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
2731 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
2732 ; CI-NEXT: v_add_f32_e32 v7, v7, v9
2733 ; CI-NEXT: v_add_f32_e32 v6, v6, v8
2734 ; CI-NEXT: v_add_f32_e32 v1, v1, v3
2735 ; CI-NEXT: v_add_f32_e32 v0, v0, v2
2736 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2737 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2738 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v7
2739 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v6
2740 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2741 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2742 ; CI-NEXT: v_or_b32_e32 v1, v2, v1
2743 ; CI-NEXT: v_or_b32_e32 v0, v3, v0
2744 ; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
2747 ; VI-LABEL: fadd_v4f16:
2749 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2750 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2751 ; VI-NEXT: v_mov_b32_e32 v0, s2
2752 ; VI-NEXT: v_mov_b32_e32 v1, s3
2753 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2754 ; VI-NEXT: v_mov_b32_e32 v4, s0
2755 ; VI-NEXT: v_mov_b32_e32 v5, s1
2756 ; VI-NEXT: s_waitcnt vmcnt(0)
2757 ; VI-NEXT: v_add_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2758 ; VI-NEXT: v_add_f16_e32 v1, v1, v3
2759 ; VI-NEXT: v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2760 ; VI-NEXT: v_add_f16_e32 v0, v0, v2
2761 ; VI-NEXT: v_or_b32_e32 v1, v1, v6
2762 ; VI-NEXT: v_or_b32_e32 v0, v0, v3
2763 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
2766 ; GFX11-LABEL: fadd_v4f16:
2768 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2769 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
2770 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2771 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
2772 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2773 ; GFX11-NEXT: v_pk_add_f16 v1, v1, v3
2774 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v2
2775 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
2776 ; GFX11-NEXT: s_nop 0
2777 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2778 ; GFX11-NEXT: s_endpgm
2779 %b_ptr = getelementptr <4 x half>, ptr addrspace(1) %in, i32 1
2780 %a = load <4 x half>, ptr addrspace(1) %in, align 16
2781 %b = load <4 x half>, ptr addrspace(1) %b_ptr, align 16
2782 %result = fadd <4 x half> %a, %b
2783 store <4 x half> %result, ptr addrspace(1) %out, align 16
; <8 x half> add of two by-value kernel arguments, stored to %out.
; Both operands arrive through one scalar load (s_load_dwordx8 / s_load_b256),
; so no vector memory load appears in the expected output.
; Expected arithmetic per run line, as pinned by the checks below
;   - kaveri  -- sixteen v_cvt_f32_f16, eight v_add_f32, eight v_cvt_f16_f32,
;                then v_lshlrev_b32 + v_or_b32 to pack four result dwords.
;   - tonga   -- per result dword, one v_add_f16_e32 (low lane) and one
;                v_add_f16_sdwa writing WORD_1 (high lane), merged with
;                v_or_b32.
;   - gfx1100 -- four v_pk_add_f16, one per result dword.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2787 define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 {
2788 ; CI-LABEL: fadd_v8f16:
2790 ; CI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x4
2791 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
2792 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2793 ; CI-NEXT: s_lshr_b32 s2, s8, 16
2794 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
2795 ; CI-NEXT: s_lshr_b32 s2, s11, 16
2796 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s2
2797 ; CI-NEXT: s_lshr_b32 s2, s12, 16
2798 ; CI-NEXT: v_cvt_f32_f16_e32 v8, s2
2799 ; CI-NEXT: s_lshr_b32 s2, s13, 16
2800 ; CI-NEXT: s_lshr_b32 s3, s9, 16
2801 ; CI-NEXT: v_cvt_f32_f16_e32 v9, s2
2802 ; CI-NEXT: s_lshr_b32 s2, s14, 16
2803 ; CI-NEXT: s_lshr_b32 s4, s10, 16
2804 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
2805 ; CI-NEXT: v_cvt_f32_f16_e32 v10, s2
2806 ; CI-NEXT: s_lshr_b32 s2, s15, 16
2807 ; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
2808 ; CI-NEXT: v_cvt_f32_f16_e32 v4, s8
2809 ; CI-NEXT: v_cvt_f32_f16_e32 v5, s9
2810 ; CI-NEXT: v_cvt_f32_f16_e32 v11, s2
2811 ; CI-NEXT: v_cvt_f32_f16_e32 v12, s12
2812 ; CI-NEXT: v_cvt_f32_f16_e32 v13, s13
2813 ; CI-NEXT: v_cvt_f32_f16_e32 v6, s10
2814 ; CI-NEXT: v_cvt_f32_f16_e32 v7, s11
2815 ; CI-NEXT: v_cvt_f32_f16_e32 v14, s15
2816 ; CI-NEXT: v_cvt_f32_f16_e32 v15, s14
2817 ; CI-NEXT: v_add_f32_e32 v1, v1, v9
2818 ; CI-NEXT: v_add_f32_e32 v0, v0, v8
2819 ; CI-NEXT: v_add_f32_e32 v3, v3, v11
2820 ; CI-NEXT: v_add_f32_e32 v2, v2, v10
2821 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
2822 ; CI-NEXT: v_add_f32_e32 v5, v5, v13
2823 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
2824 ; CI-NEXT: v_add_f32_e32 v4, v4, v12
2825 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
2826 ; CI-NEXT: v_add_f32_e32 v7, v7, v14
2827 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
2828 ; CI-NEXT: v_add_f32_e32 v6, v6, v15
2829 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
2830 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
2831 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
2832 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
2833 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2834 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2835 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2836 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2837 ; CI-NEXT: v_or_b32_e32 v1, v5, v1
2838 ; CI-NEXT: v_or_b32_e32 v0, v4, v0
2839 ; CI-NEXT: v_mov_b32_e32 v5, s1
2840 ; CI-NEXT: v_or_b32_e32 v3, v7, v3
2841 ; CI-NEXT: v_or_b32_e32 v2, v6, v2
2842 ; CI-NEXT: v_mov_b32_e32 v4, s0
2843 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2846 ; VI-LABEL: fadd_v8f16:
2848 ; VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x10
2849 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
2850 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2851 ; VI-NEXT: s_lshr_b32 s2, s15, 16
2852 ; VI-NEXT: s_lshr_b32 s3, s11, 16
2853 ; VI-NEXT: v_mov_b32_e32 v0, s15
2854 ; VI-NEXT: v_mov_b32_e32 v1, s2
2855 ; VI-NEXT: v_mov_b32_e32 v2, s3
2856 ; VI-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2857 ; VI-NEXT: v_add_f16_e32 v0, s11, v0
2858 ; VI-NEXT: s_lshr_b32 s2, s14, 16
2859 ; VI-NEXT: s_lshr_b32 s3, s10, 16
2860 ; VI-NEXT: v_or_b32_e32 v3, v0, v1
2861 ; VI-NEXT: v_mov_b32_e32 v0, s2
2862 ; VI-NEXT: v_mov_b32_e32 v1, s3
2863 ; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2864 ; VI-NEXT: v_mov_b32_e32 v1, s14
2865 ; VI-NEXT: v_add_f16_e32 v1, s10, v1
2866 ; VI-NEXT: s_lshr_b32 s2, s13, 16
2867 ; VI-NEXT: s_lshr_b32 s3, s9, 16
2868 ; VI-NEXT: v_or_b32_e32 v2, v1, v0
2869 ; VI-NEXT: v_mov_b32_e32 v0, s2
2870 ; VI-NEXT: v_mov_b32_e32 v1, s3
2871 ; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2872 ; VI-NEXT: v_mov_b32_e32 v1, s13
2873 ; VI-NEXT: v_add_f16_e32 v1, s9, v1
2874 ; VI-NEXT: s_lshr_b32 s2, s12, 16
2875 ; VI-NEXT: s_lshr_b32 s3, s8, 16
2876 ; VI-NEXT: v_or_b32_e32 v1, v1, v0
2877 ; VI-NEXT: v_mov_b32_e32 v0, s2
2878 ; VI-NEXT: v_mov_b32_e32 v4, s3
2879 ; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2880 ; VI-NEXT: v_mov_b32_e32 v4, s12
2881 ; VI-NEXT: v_add_f16_e32 v4, s8, v4
2882 ; VI-NEXT: v_or_b32_e32 v0, v4, v0
2883 ; VI-NEXT: v_mov_b32_e32 v5, s1
2884 ; VI-NEXT: v_mov_b32_e32 v4, s0
2885 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2888 ; GFX11-LABEL: fadd_v8f16:
2890 ; GFX11-NEXT: s_clause 0x1
2891 ; GFX11-NEXT: s_load_b256 s[4:11], s[2:3], 0x10
2892 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
2893 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
2894 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2895 ; GFX11-NEXT: v_pk_add_f16 v3, s7, s11
2896 ; GFX11-NEXT: v_pk_add_f16 v2, s6, s10
2897 ; GFX11-NEXT: v_pk_add_f16 v1, s5, s9
2898 ; GFX11-NEXT: v_pk_add_f16 v0, s4, s8
2899 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
2900 ; GFX11-NEXT: s_nop 0
2901 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2902 ; GFX11-NEXT: s_endpgm
2903 %add = fadd <8 x half> %a, %b
2904 store <8 x half> %add, ptr addrspace(1) %out, align 32
; Loads a half from %in, bitcasts it to i16 and stores it to %out.
; Note the argument order here is (%in, %out).
; The checks expect a plain 16-bit load followed by a 16-bit store with no
; conversion instruction in between (flat_load_ushort -> flat_store_short on
; kaveri and tonga, which share one check prefix; global_load_u16 ->
; global_store_b16 on gfx1100).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2908 define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
2909 ; CIVI-LABEL: test_bitcast_from_half:
2911 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2912 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
2913 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
2914 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
2915 ; CIVI-NEXT: flat_load_ushort v2, v[0:1]
2916 ; CIVI-NEXT: v_mov_b32_e32 v0, s2
2917 ; CIVI-NEXT: v_mov_b32_e32 v1, s3
2918 ; CIVI-NEXT: s_waitcnt vmcnt(0)
2919 ; CIVI-NEXT: flat_store_short v[0:1], v2
2920 ; CIVI-NEXT: s_endpgm
2922 ; GFX11-LABEL: test_bitcast_from_half:
2924 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2925 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2926 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2927 ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
2928 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2929 ; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
2930 ; GFX11-NEXT: s_nop 0
2931 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2932 ; GFX11-NEXT: s_endpgm
2933 %val = load half, ptr addrspace(1) %in
2934 %val_int = bitcast half %val to i16
2935 store i16 %val_int, ptr addrspace(1) %out
; Loads an i16 from %in, bitcasts it to half and stores it to %out.
; Note the argument order here is (%out, %in), so the expected load uses the
; second pointer argument (s[2:3]) and the store uses the first (s[0:1]).
; The checks expect a plain 16-bit load followed by a 16-bit store with no
; conversion instruction in between (flat_load_ushort -> flat_store_short on
; kaveri and tonga, which share one check prefix; global_load_u16 ->
; global_store_b16 on gfx1100).
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2939 define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2940 ; CIVI-LABEL: test_bitcast_to_half:
2942 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
2943 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
2944 ; CIVI-NEXT: v_mov_b32_e32 v0, s2
2945 ; CIVI-NEXT: v_mov_b32_e32 v1, s3
2946 ; CIVI-NEXT: flat_load_ushort v2, v[0:1]
2947 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
2948 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
2949 ; CIVI-NEXT: s_waitcnt vmcnt(0)
2950 ; CIVI-NEXT: flat_store_short v[0:1], v2
2951 ; CIVI-NEXT: s_endpgm
2953 ; GFX11-LABEL: test_bitcast_to_half:
2955 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2956 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2957 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2958 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
2959 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2960 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
2961 ; GFX11-NEXT: s_nop 0
2962 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2963 ; GFX11-NEXT: s_endpgm
2964 %val = load i16, ptr addrspace(1) %in
2965 %val_fp = bitcast i16 %val to half
2966 store half %val_fp, ptr addrspace(1) %out
; Attribute group #0, referenced by the kernel definitions in this file via
; the trailing `#0`; it marks them nounwind.
2970 attributes #0 = { nounwind }