; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s
; half args should be promoted to float for CI and lower.
define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
; CI-LABEL: load_f16_arg:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_load_dword s2, s[4:5], 0x2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_short v[0:1], v2
; VI-LABEL: load_f16_arg:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dword s2, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_short v[0:1], v2
  store half %arg, half addrspace(1)* %out
define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
; CI-LABEL: load_v2f16_arg:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_load_dword s2, s[4:5], 0x2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; VI-LABEL: load_v2f16_arg:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dword s2, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
  store <2 x half> %arg, <2 x half> addrspace(1)* %out
define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
; GCN-LABEL: load_v3f16_arg:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s0, 4
; GCN-NEXT: s_addc_u32 s5, s1, 0
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v5, s2
; GCN-NEXT: flat_store_short v[2:3], v4
; GCN-NEXT: flat_store_dword v[0:1], v5
  store <3 x half> %arg, <3 x half> addrspace(1)* %out
; FIXME: Why not one load?
define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
; GCN-LABEL: load_v4f16_arg:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
  store <4 x half> %arg, <4 x half> addrspace(1)* %out
define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
; CI-LABEL: load_v8f16_arg:
; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v4, s6
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v5, s7
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-LABEL: load_v8f16_arg:
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
  store <8 x half> %arg, <8 x half> addrspace(1)* %out
define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
; CI-LABEL: extload_v2f16_arg:
; CI-NEXT: s_load_dword s2, s[4:5], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s3, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-LABEL: extload_v2f16_arg:
; VI-NEXT: s_load_dword s2, s[4:5], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
  %fpext = fpext <2 x half> %in to <2 x float>
  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
; CI-LABEL: extload_f16_to_f32_arg:
; CI-NEXT: s_load_dword s2, s[4:5], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; VI-LABEL: extload_f16_to_f32_arg:
; VI-NEXT: s_load_dword s2, s[4:5], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
  %ext = fpext half %arg to float
  store float %ext, float addrspace(1)* %out
define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
; CI-LABEL: extload_v2f16_to_v2f32_arg:
; CI-NEXT: s_load_dword s2, s[4:5], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s3, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-LABEL: extload_v2f16_to_v2f32_arg:
; VI-NEXT: s_load_dword s2, s[4:5], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
  %ext = fpext <2 x half> %arg to <2 x float>
  store <2 x float> %ext, <2 x float> addrspace(1)* %out
define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
; CI-LABEL: extload_v3f16_to_v3f32_arg:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
; CI-NEXT: v_cvt_f32_f16_e32 v1, s4
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v4, s1
; CI-NEXT: v_mov_b32_e32 v3, s0
; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-LABEL: extload_v3f16_to_v3f32_arg:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v1, s4
; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
  %ext = fpext <3 x half> %arg to <3 x float>
  store <3 x float> %ext, <3 x float> addrspace(1)* %out
define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
; CI-LABEL: extload_v4f16_to_v4f32_arg:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s3, 16
; CI-NEXT: s_lshr_b32 s5, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
; CI-NEXT: v_cvt_f32_f16_e32 v3, s4
; CI-NEXT: v_cvt_f32_f16_e32 v1, s5
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-LABEL: extload_v4f16_to_v4f32_arg:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: s_lshr_b32 s5, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v3, s4
; VI-NEXT: v_cvt_f32_f16_e32 v1, s5
; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
  %ext = fpext <4 x half> %arg to <4 x float>
  store <4 x float> %ext, <4 x float> addrspace(1)* %out
define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
; CI-LABEL: extload_v8f16_to_v8f32_arg:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s6, s1, 16
; CI-NEXT: s_lshr_b32 s7, s0, 16
; CI-NEXT: s_lshr_b32 s8, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v3, s6
; CI-NEXT: s_lshr_b32 s6, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v7, s8
; CI-NEXT: v_cvt_f32_f16_e32 v5, s6
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: v_cvt_f32_f16_e32 v6, s3
; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
; CI-NEXT: s_add_u32 s0, s4, 16
; CI-NEXT: v_cvt_f32_f16_e32 v2, s1
; CI-NEXT: s_addc_u32 s1, s5, 0
; CI-NEXT: v_cvt_f32_f16_e32 v1, s7
; CI-NEXT: v_mov_b32_e32 v9, s1
; CI-NEXT: v_mov_b32_e32 v8, s0
; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_mov_b32_e32 v5, s5
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-LABEL: extload_v8f16_to_v8f32_arg:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s1, 16
; VI-NEXT: s_lshr_b32 s7, s0, 16
; VI-NEXT: s_lshr_b32 s8, s3, 16
; VI-NEXT: v_cvt_f32_f16_e32 v3, s6
; VI-NEXT: s_lshr_b32 s6, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v7, s8
; VI-NEXT: v_cvt_f32_f16_e32 v5, s6
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f32_f16_e32 v6, s3
; VI-NEXT: v_cvt_f32_f16_e32 v4, s2
; VI-NEXT: s_add_u32 s0, s4, 16
; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_cvt_f32_f16_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
  %ext = fpext <8 x half> %arg to <8 x float>
  store <8 x float> %ext, <8 x float> addrspace(1)* %out
define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
; CI-LABEL: extload_f16_to_f64_arg:
; CI-NEXT: s_load_dword s0, s[4:5], 0x2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-LABEL: extload_f16_to_f64_arg:
; VI-NEXT: s_load_dword s0, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
  %ext = fpext half %arg to double
  store double %ext, double addrspace(1)* %out
define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
; CI-LABEL: extload_v2f16_to_v2f64_arg:
; CI-NEXT: s_load_dword s0, s[4:5], 0x2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s1
; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-LABEL: extload_v2f16_to_v2f64_arg:
; VI-NEXT: s_load_dword s0, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s1
; VI-NEXT: v_cvt_f32_f16_e32 v1, s0
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
  %ext = fpext <2 x half> %arg to <2 x double>
  store <2 x double> %ext, <2 x double> addrspace(1)* %out
define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
; CI-LABEL: extload_v3f16_to_v3f64_arg:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
; CI-NEXT: s_lshr_b32 s4, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s2
; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT: v_mov_b32_e32 v7, s3
; CI-NEXT: v_mov_b32_e32 v6, s2
; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-LABEL: extload_v3f16_to_v3f64_arg:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v2, s4
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
  %ext = fpext <3 x half> %arg to <3 x double>
  store <3 x double> %ext, <3 x double> addrspace(1)* %out
define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
; CI-LABEL: extload_v4f16_to_v4f64_arg:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
; CI-NEXT: s_lshr_b32 s5, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
; CI-NEXT: v_cvt_f32_f16_e32 v6, s5
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; CI-NEXT: v_mov_b32_e32 v9, s3
; CI-NEXT: v_mov_b32_e32 v8, s2
; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; VI-LABEL: extload_v4f16_to_v4f64_arg:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s3, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s3
; VI-NEXT: v_cvt_f32_f16_e32 v2, s5
; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v4, s2
; VI-NEXT: v_cvt_f32_f16_e32 v6, s4
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
  %ext = fpext <4 x half> %arg to <4 x double>
  store <4 x double> %ext, <4 x double> addrspace(1)* %out
define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
; CI-LABEL: extload_v8f16_to_v8f64_arg:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s6, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s6
; CI-NEXT: v_cvt_f32_f16_e32 v12, s3
; CI-NEXT: s_lshr_b32 s7, s2, 16
; CI-NEXT: s_lshr_b32 s8, s1, 16
; CI-NEXT: s_lshr_b32 s6, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s7
; CI-NEXT: v_cvt_f32_f16_e32 v8, s2
; CI-NEXT: v_cvt_f32_f16_e32 v9, s0
; CI-NEXT: s_add_u32 s0, s4, 48
; CI-NEXT: v_cvt_f32_f16_e32 v5, s1
; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0
; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; CI-NEXT: s_addc_u32 s1, s5, 0
; CI-NEXT: v_cvt_f32_f16_e32 v4, s8
; CI-NEXT: v_mov_b32_e32 v17, s1
; CI-NEXT: v_mov_b32_e32 v16, s0
; CI-NEXT: s_add_u32 s0, s4, 32
; CI-NEXT: v_cvt_f32_f16_e32 v2, s6
; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; CI-NEXT: s_addc_u32 s1, s5, 0
; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
; CI-NEXT: v_mov_b32_e32 v13, s1
; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; CI-NEXT: v_mov_b32_e32 v12, s0
; CI-NEXT: s_add_u32 s0, s4, 16
; CI-NEXT: s_addc_u32 s1, s5, 0
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; CI-NEXT: v_mov_b32_e32 v9, s1
; CI-NEXT: v_mov_b32_e32 v8, s0
; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_mov_b32_e32 v5, s5
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-LABEL: extload_v8f16_to_v8f64_arg:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s0, 16
; VI-NEXT: s_lshr_b32 s8, s2, 16
; VI-NEXT: s_lshr_b32 s9, s3, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s6
; VI-NEXT: v_cvt_f32_f16_e32 v4, s8
; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
; VI-NEXT: v_cvt_f32_f16_e32 v12, s3
; VI-NEXT: s_lshr_b32 s7, s1, 16
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f32_f16_e32 v8, s2
; VI-NEXT: s_add_u32 s0, s4, 48
; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4
; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5
; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_cvt_f32_f16_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v17, s1
; VI-NEXT: v_mov_b32_e32 v16, s0
; VI-NEXT: s_add_u32 s0, s4, 32
; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1
; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; VI-NEXT: v_mov_b32_e32 v12, s0
; VI-NEXT: s_add_u32 s0, s4, 16
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
  %ext = fpext <8 x half> %arg to <8 x double>
  store <8 x double> %ext, <8 x double> addrspace(1)* %out
define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
; GCN-LABEL: global_load_store_f16:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_ushort v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_short v[0:1], v2
  %val = load half, half addrspace(1)* %in
  store half %val, half addrspace(1)* %out
define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; GCN-LABEL: global_load_store_v2f16:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dword v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v[0:1], v2
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  store <2 x half> %val, <2 x half> addrspace(1)* %out
define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
; GCN-LABEL: global_load_store_v4f16:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  store <4 x half> %val, <4 x half> addrspace(1)* %out
define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
; GCN-LABEL: global_load_store_v8f16:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  store <8 x half> %val, <8 x half> addrspace(1)* %out
define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
; GCN-LABEL: global_extload_f16_to_f32:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_ushort v0, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to float
  store float %cvt, float addrspace(1)* %out
define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v2f16_to_v2f32:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dword v1, v[0:1]
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, v1
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-LABEL: global_extload_v2f16_to_v2f32:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v1
; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v3f16_to_v3f32:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s0
; CI-NEXT: v_mov_b32_e32 v4, s1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, v1
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-LABEL: global_extload_v3f16_to_v3f32:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v1
; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v4f16_to_v4f32:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx2 v[3:4], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v2, v4
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; CI-NEXT: v_cvt_f32_f16_e32 v0, v3
; CI-NEXT: v_cvt_f32_f16_e32 v3, v1
; CI-NEXT: v_cvt_f32_f16_e32 v1, v4
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-LABEL: global_extload_v4f16_to_v4f32:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v4
; VI-NEXT: v_cvt_f32_f16_e32 v2, v5
; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v8f16_to_v8f32:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v13, s1
; CI-NEXT: v_mov_b32_e32 v12, s0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v10, v3
; CI-NEXT: v_cvt_f32_f16_e32 v8, v2
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT: v_cvt_f32_f16_e32 v6, v1
; CI-NEXT: v_cvt_f32_f16_e32 v4, v0
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v11, v3
; CI-NEXT: v_cvt_f32_f16_e32 v9, v2
; CI-NEXT: v_cvt_f32_f16_e32 v7, v1
; CI-NEXT: v_cvt_f32_f16_e32 v5, v0
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; VI-LABEL: global_extload_v8f16_to_v8f32:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: v_mov_b32_e32 v12, s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v10, v3
; VI-NEXT: v_cvt_f32_f16_e32 v8, v2
; VI-NEXT: v_cvt_f32_f16_sdwa v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v6, v1
; VI-NEXT: v_cvt_f32_f16_e32 v4, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; VI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x float>
  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v16f16_to_v16f32:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 16
; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: s_addc_u32 s5, s3, 0
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v14, s3
; CI-NEXT: v_mov_b32_e32 v13, s2
; CI-NEXT: s_add_u32 s2, s0, 48
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f32_f16_e32 v8, v1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v11, v7
; CI-NEXT: v_cvt_f32_f16_e32 v9, v6
; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; CI-NEXT: v_cvt_f32_f16_e32 v12, v7
; CI-NEXT: v_cvt_f32_f16_e32 v10, v6
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v5
; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v4
; CI-NEXT: flat_store_dwordx4 v[13:14], v[9:12]
; CI-NEXT: v_cvt_f32_f16_e32 v6, v0
; CI-NEXT: v_cvt_f32_f16_e32 v12, v3
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v10, v2
; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
; CI-NEXT: v_cvt_f32_f16_e32 v2, v5
; CI-NEXT: v_cvt_f32_f16_e32 v0, v4
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_cvt_f32_f16_e32 v9, v1
; CI-NEXT: v_cvt_f32_f16_e32 v13, v3
; CI-NEXT: v_cvt_f32_f16_e32 v3, v16
; CI-NEXT: v_cvt_f32_f16_e32 v1, v17
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: v_mov_b32_e32 v15, s3
; CI-NEXT: v_mov_b32_e32 v17, s1
; CI-NEXT: v_mov_b32_e32 v14, s2
; CI-NEXT: v_mov_b32_e32 v16, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9]
; VI-LABEL: global_extload_v16f16_to_v16f32:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v19, s3
; VI-NEXT: v_mov_b32_e32 v18, s2
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: v_mov_b32_e32 v17, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v16, s0
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v21, s3
; VI-NEXT: v_mov_b32_e32 v20, s2
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v14, v3
; VI-NEXT: v_cvt_f32_f16_e32 v12, v2
; VI-NEXT: v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v10, v1
; VI-NEXT: v_cvt_f32_f16_e32 v8, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v2, v5
; VI-NEXT: v_cvt_f32_f16_e32 v14, v7
; VI-NEXT: v_cvt_f32_f16_e32 v12, v6
; VI-NEXT: v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v0, v4
; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; VI-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x float>
  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
; GCN-LABEL: global_extload_f16_to_f64:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_ushort v0, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to double
  store double %cvt, double addrspace(1)* %out
define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v2f16_to_v2f64:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v2, v1
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-LABEL: global_extload_v2f16_to_v2f64:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x double>
  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v3f16_to_v3f64:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v7, s3
; CI-NEXT: v_mov_b32_e32 v6, s2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-LABEL: global_extload_v3f16_to_v3f64:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v3, v1
; VI-NEXT: v_cvt_f32_f16_e32 v2, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v3
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: flat_store_dwordx2 v[8:9], v[6:7]
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x double>
  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v4f16_to_v4f64:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v9, s1
; CI-NEXT: v_mov_b32_e32 v8, s0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; CI-NEXT: v_cvt_f32_f16_e32 v3, v1
; CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v10, v0
; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10
; CI-NEXT: v_mov_b32_e32 v11, s3
; CI-NEXT: v_mov_b32_e32 v10, s2
; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-LABEL: global_extload_v4f16_to_v4f64:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v3, v1
; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v2, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10
; VI-NEXT: v_mov_b32_e32 v11, s3
; VI-NEXT: v_mov_b32_e32 v10, s2
; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x double>
  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v8f16_to_v8f64:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: s_add_u32 s2, s0, 48
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v7, s3
; CI-NEXT: v_mov_b32_e32 v6, s2
; CI-NEXT: s_add_u32 s2, s0, 32
; CI-NEXT: v_mov_b32_e32 v13, s1
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v12, s0
; CI-NEXT: s_add_u32 s0, s0, 16
; CI-NEXT: v_mov_b32_e32 v15, s3
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v14, s2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; CI-NEXT: v_cvt_f32_f16_e32 v8, v2
; CI-NEXT: v_cvt_f32_f16_e32 v2, v4
; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; CI-NEXT: v_cvt_f32_f16_e32 v10, v1
; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v4, v0
; CI-NEXT: v_cvt_f32_f16_e32 v16, v5
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT: v_cvt_f32_f16_e32 v17, v9
; CI-NEXT: v_cvt_f32_f16_e32 v18, v11
; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; CI-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10
; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17
; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18
; CI-NEXT: v_mov_b32_e32 v17, s1
; CI-NEXT: v_mov_b32_e32 v16, s0
; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; VI-LABEL: global_extload_v8f16_to_v8f64:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v8, s3
; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: s_add_u32 s2, s0, 32
; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v12, s0
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: v_mov_b32_e32 v15, s3
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v14, s2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v9, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v0, v3
; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v10, v1
; VI-NEXT: v_cvt_f32_f16_e32 v11, v2
; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0
; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
; VI-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v11
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v16
; VI-NEXT: v_mov_b32_e32 v17, s1
; VI-NEXT: v_mov_b32_e32 v16, s0
; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; VI-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x double>
  store <8 x double> %cvt, <8 x double> addrspace(1)* %out
define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
; CI-LABEL: global_extload_v16f16_to_v16f64:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: s_add_u32 s2, s2, 16
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT: s_add_u32 s2, s0, 48
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v15, s3
; CI-NEXT: v_mov_b32_e32 v14, s2
; CI-NEXT: s_add_u32 s2, s0, 32
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v17, s3
; CI-NEXT: v_mov_b32_e32 v16, s2
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v19, s3
; CI-NEXT: v_mov_b32_e32 v18, s2
; CI-NEXT: s_add_u32 s2, s0, 0x70
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_mov_b32_e32 v13, s1
; CI-NEXT: v_mov_b32_e32 v12, s0
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_cvt_f32_f16_e32 v10, v8
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5
; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_cvt_f32_f16_e32 v21, v5
; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; CI-NEXT: v_mov_b32_e32 v15, s3
; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2
; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; CI-NEXT: v_mov_b32_e32 v14, s2
; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v9, v0
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7
; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
; CI-NEXT: v_cvt_f32_f16_e32 v8, v10
; CI-NEXT: s_add_u32 s2, s0, 0x60
; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cvt_f32_f16_e32 v10, v11
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; CI-NEXT: v_mov_b32_e32 v17, s3
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
; CI-NEXT: v_cvt_f32_f16_e32 v7, v20
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v12, v5
; CI-NEXT: v_mov_b32_e32 v16, s2
; CI-NEXT: s_add_u32 s2, s0, 0x50
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; CI-NEXT: s_add_u32 s0, s0, 64
; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7
; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12
; CI-NEXT: v_mov_b32_e32 v19, s3
; CI-NEXT: v_mov_b32_e32 v13, s1
; CI-NEXT: v_mov_b32_e32 v18, s2
; CI-NEXT: v_mov_b32_e32 v12, s0
; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; VI-LABEL: global_extload_v16f16_to_v16f64:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v14, s3
; VI-NEXT: v_mov_b32_e32 v13, s2
; VI-NEXT: s_add_u32 s2, s0, 32
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v16, s3
; VI-NEXT: v_mov_b32_e32 v15, s2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v18, s3
; VI-NEXT: v_mov_b32_e32 v17, s2
; VI-NEXT: s_add_u32 s2, s0, 0x50
; VI-NEXT: v_mov_b32_e32 v12, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v11, s0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v8, v7
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8
; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
; VI-NEXT: flat_store_dwordx4 v[13:14], v[7:10]
; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v10, v2
; VI-NEXT: v_mov_b32_e32 v14, s3
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; VI-NEXT: v_mov_b32_e32 v13, s2
; VI-NEXT: s_add_u32 s2, s0, 64
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9]
; VI-NEXT: v_mov_b32_e32 v16, s3
; VI-NEXT: v_cvt_f32_f16_e32 v6, v5
; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v8, v4
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; VI-NEXT: v_mov_b32_e32 v15, s2
; VI-NEXT: s_add_u32 s2, s0, 0x70
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7]
; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v2, v1
; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
; VI-NEXT: v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v7, v3
; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v9
; VI-NEXT: v_cvt_f32_f16_e32 v9, v0
; VI-NEXT: v_cvt_f64_f32_e32 v[1:2], v2
; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10
1440 ; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
1441 ; VI-NEXT: s_add_u32 s0, s0, 0x60
1442 ; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4]
1443 ; VI-NEXT: s_addc_u32 s1, s1, 0
1444 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
1445 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17
1446 ; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8
1447 ; VI-NEXT: v_mov_b32_e32 v20, s3
1448 ; VI-NEXT: v_mov_b32_e32 v14, s1
1449 ; VI-NEXT: v_mov_b32_e32 v19, s2
1450 ; VI-NEXT: v_mov_b32_e32 v13, s0
1451 ; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12]
1452 ; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
1453 ; VI-NEXT: flat_store_dwordx4 v[13:14], v[5:8]
1455 %val = load <16 x half>, <16 x half> addrspace(1)* %in
1456 %cvt = fpext <16 x half> %val to <16 x double>
1457 store <16 x double> %cvt, <16 x double> addrspace(1)* %out
1461 define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
1462 ; GCN-LABEL: global_truncstore_f32_to_f16:
1464 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1465 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1466 ; GCN-NEXT: v_mov_b32_e32 v0, s2
1467 ; GCN-NEXT: v_mov_b32_e32 v1, s3
1468 ; GCN-NEXT: flat_load_dword v0, v[0:1]
1469 ; GCN-NEXT: v_mov_b32_e32 v1, s1
1470 ; GCN-NEXT: s_waitcnt vmcnt(0)
1471 ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0
1472 ; GCN-NEXT: v_mov_b32_e32 v0, s0
1473 ; GCN-NEXT: flat_store_short v[0:1], v2
1474 ; GCN-NEXT: s_endpgm
1475 %val = load float, float addrspace(1)* %in
1476 %cvt = fptrunc float %val to half
1477 store half %cvt, half addrspace(1)* %out
define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
; CI-LABEL: global_truncstore_v2f32_to_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v2, v1
; CI-NEXT: v_cvt_f16_f32_e32 v3, v0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v2f32_to_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v3, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%val = load <2 x float>, <2 x float> addrspace(1)* %in
%cvt = fptrunc <2 x float> %val to <2 x half>
store <2 x half> %cvt, <2 x half> addrspace(1)* %out
ret void
}

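; v3f32 -> v3f16 is split into a dword store for the low two halves and a short store for the third.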
define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
; CI-LABEL: global_truncstore_v3f32_to_v3f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; CI-NEXT: s_add_u32 s2, s0, 4
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v3, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v4, v0
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: flat_store_short v[0:1], v2
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_or_b32_e32 v2, v4, v3
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v3f32_to_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 4
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v4, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_or_b32_e32 v3, v4, v3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
%val = load <3 x float>, <3 x float> addrspace(1)* %in
%cvt = fptrunc <3 x float> %val to <3 x half>
store <3 x half> %cvt, <3 x half> addrspace(1)* %out
ret void
}

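; v4f32 -> v4f16 packs the four results into two dwords and stores them with a single flat_store_dwordx2.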
define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
; CI-LABEL: global_truncstore_v4f32_to_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; CI-NEXT: v_or_b32_e32 v1, v2, v3
; CI-NEXT: v_or_b32_e32 v0, v0, v6
; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v4f32_to_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v5, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_or_b32_e32 v3, v2, v3
; VI-NEXT: v_or_b32_e32 v2, v5, v4
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
%val = load <4 x float>, <4 x float> addrspace(1)* %in
%cvt = fptrunc <4 x float> %val to <4 x half>
store <4 x half> %cvt, <4 x half> addrspace(1)* %out
ret void
}

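; v8f32 -> v8f16: two dwordx4 loads feed eight conversions that are packed and stored with one flat_store_dwordx4.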
define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
; CI-LABEL: global_truncstore_v8f32_to_v8f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_add_u32 s2, s2, 16
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT: v_mov_b32_e32 v8, s0
; CI-NEXT: v_mov_b32_e32 v9, s1
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT: v_or_b32_e32 v1, v2, v3
; CI-NEXT: v_or_b32_e32 v0, v0, v10
; CI-NEXT: v_or_b32_e32 v3, v6, v7
; CI-NEXT: v_or_b32_e32 v2, v4, v5
; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v8f32_to_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v4, v4
; VI-NEXT: v_or_b32_e32 v1, v2, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v10
; VI-NEXT: v_or_b32_e32 v3, v6, v7
; VI-NEXT: v_or_b32_e32 v2, v4, v5
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
%val = load <8 x float>, <8 x float> addrspace(1)* %in
%cvt = fptrunc <8 x float> %val to <8 x half>
store <8 x half> %cvt, <8 x half> addrspace(1)* %out
ret void
}

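; v16f32 -> v16f16: four dwordx4 loads and two packed dwordx4 stores.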
define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
; CI-LABEL: global_truncstore_v16f32_to_v16f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 32
; CI-NEXT: s_addc_u32 s5, s3, 0
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_add_u32 s4, s2, 48
; CI-NEXT: s_addc_u32 s5, s3, 0
; CI-NEXT: v_mov_b32_e32 v9, s3
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_mov_b32_e32 v8, s2
; CI-NEXT: s_add_u32 s2, s2, 16
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s5
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v13, s3
; CI-NEXT: v_mov_b32_e32 v12, s2
; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; CI-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: s_waitcnt vmcnt(3)
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: s_waitcnt vmcnt(2)
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_cvt_f16_f32_e32 v16, v5
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
; CI-NEXT: v_cvt_f16_f32_e32 v17, v4
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
; CI-NEXT: v_or_b32_e32 v1, v2, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_or_b32_e32 v0, v0, v18
; CI-NEXT: v_or_b32_e32 v3, v6, v2
; CI-NEXT: v_or_b32_e32 v2, v17, v7
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11
; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v9
; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v15
; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v13
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_or_b32_e32 v1, v10, v6
; CI-NEXT: v_or_b32_e32 v0, v8, v7
; CI-NEXT: v_or_b32_e32 v3, v14, v9
; CI-NEXT: v_or_b32_e32 v2, v12, v11
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v16f32_to_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 32
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_add_u32 s4, s2, 48
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v13, s3
; VI-NEXT: v_mov_b32_e32 v12, s2
; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
; VI-NEXT: v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v18, v4
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v10, v10
; VI-NEXT: v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v8, v8
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v14, v14
; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v12, v12
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_or_b32_e32 v1, v2, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v16
; VI-NEXT: v_or_b32_e32 v3, v6, v7
; VI-NEXT: v_or_b32_e32 v2, v18, v17
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_or_b32_e32 v1, v10, v11
; VI-NEXT: v_or_b32_e32 v0, v8, v9
; VI-NEXT: v_or_b32_e32 v3, v14, v15
; VI-NEXT: v_or_b32_e32 v2, v12, v13
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%val = load <16 x float>, <16 x float> addrspace(1)* %in
%cvt = fptrunc <16 x float> %val to <16 x half>
store <16 x half> %cvt, <16 x half> addrspace(1)* %out
ret void
}

; FIXME: Unsafe math should fold conversions away
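; Scalar f16 add: CI promotes both operands to f32 and adds there; VI uses v_add_f16 directly.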
define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
; CI-LABEL: fadd_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[4:5], 0x2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_lshr_b32 s0, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_add_f32_e32 v0, v0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_short v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: fadd_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_f16_e32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
%add = fadd half %a, %b
store half %add, half addrspace(1)* %out, align 4
ret void
}

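; v2f16 add of packed kernel arguments: CI unpacks and promotes to f32; VI adds the low halves with v_add_f16 and the high halves with v_add_f16_sdwa.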
define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
; CI-LABEL: fadd_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
; CI-NEXT: s_lshr_b32 s2, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
; CI-NEXT: v_cvt_f32_f16_e32 v3, s2
; CI-NEXT: v_add_f32_e32 v0, v0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_add_f32_e32 v1, v2, v3
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v2, v0, v1
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: fadd_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: s_lshr_b32 s5, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_add_f16_e32 v0, s2, v0
; VI-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%add = fadd <2 x half> %a, %b
store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
ret void
}

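; v4f16 add with both operands loaded from memory; VI keeps the values packed and uses SDWA adds for the high halves.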
define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
; CI-LABEL: fadd_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v6, v0
; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v7, v1
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT: v_cvt_f32_f16_e32 v8, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT: v_cvt_f32_f16_e32 v9, v3
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_add_f32_e32 v7, v7, v9
; CI-NEXT: v_add_f32_e32 v6, v6, v8
; CI-NEXT: v_add_f32_e32 v1, v1, v3
; CI-NEXT: v_add_f32_e32 v0, v0, v2
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v2, v7
; CI-NEXT: v_cvt_f16_f32_e32 v3, v6
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT: v_or_b32_e32 v1, v2, v1
; CI-NEXT: v_or_b32_e32 v0, v3, v0
; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; CI-NEXT: s_endpgm
;
; VI-LABEL: fadd_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_add_f16_e32 v1, v1, v3
; VI-NEXT: v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_add_f16_e32 v0, v0, v2
; VI-NEXT: v_or_b32_e32 v1, v1, v6
; VI-NEXT: v_or_b32_e32 v0, v0, v3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
%b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
%a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
%b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
%result = fadd <4 x half> %a, %b
store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
ret void
}

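; v8f16 add of kernel arguments passed in SGPRs; CI promotes every element to f32, VI pairs v_add_f16 with v_add_f16_sdwa per dword.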
define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
; CI-LABEL: fadd_v8f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s2, s8, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
; CI-NEXT: s_lshr_b32 s2, s11, 16
; CI-NEXT: v_cvt_f32_f16_e32 v3, s2
; CI-NEXT: s_lshr_b32 s2, s12, 16
; CI-NEXT: v_cvt_f32_f16_e32 v8, s2
; CI-NEXT: s_lshr_b32 s2, s13, 16
; CI-NEXT: s_lshr_b32 s3, s9, 16
; CI-NEXT: v_cvt_f32_f16_e32 v9, s2
; CI-NEXT: s_lshr_b32 s2, s14, 16
; CI-NEXT: s_lshr_b32 s4, s10, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
; CI-NEXT: v_cvt_f32_f16_e32 v10, s2
; CI-NEXT: s_lshr_b32 s2, s15, 16
; CI-NEXT: v_cvt_f32_f16_e32 v2, s4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s8
; CI-NEXT: v_cvt_f32_f16_e32 v5, s9
; CI-NEXT: v_cvt_f32_f16_e32 v11, s2
; CI-NEXT: v_cvt_f32_f16_e32 v12, s12
; CI-NEXT: v_cvt_f32_f16_e32 v13, s13
; CI-NEXT: v_cvt_f32_f16_e32 v6, s10
; CI-NEXT: v_cvt_f32_f16_e32 v7, s11
; CI-NEXT: v_cvt_f32_f16_e32 v14, s15
; CI-NEXT: v_cvt_f32_f16_e32 v15, s14
; CI-NEXT: v_add_f32_e32 v1, v1, v9
; CI-NEXT: v_add_f32_e32 v0, v0, v8
; CI-NEXT: v_add_f32_e32 v3, v3, v11
; CI-NEXT: v_add_f32_e32 v2, v2, v10
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_add_f32_e32 v5, v5, v13
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_add_f32_e32 v4, v4, v12
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_add_f32_e32 v7, v7, v14
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_add_f32_e32 v6, v6, v15
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v1, v5, v1
; CI-NEXT: v_or_b32_e32 v0, v4, v0
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_or_b32_e32 v3, v7, v3
; CI-NEXT: v_or_b32_e32 v2, v6, v2
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: fadd_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s15, 16
; VI-NEXT: s_lshr_b32 s3, s11, 16
; VI-NEXT: v_mov_b32_e32 v0, s15
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, s11, v0
; VI-NEXT: s_lshr_b32 s2, s14, 16
; VI-NEXT: s_lshr_b32 s3, s10, 16
; VI-NEXT: v_or_b32_e32 v3, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s14
; VI-NEXT: v_add_f16_e32 v1, s10, v1
; VI-NEXT: s_lshr_b32 s2, s13, 16
; VI-NEXT: s_lshr_b32 s3, s9, 16
; VI-NEXT: v_or_b32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_add_f16_e32 v1, s9, v1
; VI-NEXT: s_lshr_b32 s2, s12, 16
; VI-NEXT: s_lshr_b32 s3, s8, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_add_f16_e32 v4, s8, v4
; VI-NEXT: v_or_b32_e32 v0, v4, v0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%add = fadd <8 x half> %a, %b
store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
ret void
}

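; Bitcasting half to i16 is a no-op: just a 16-bit load and store.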
define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
; GCN-LABEL: test_bitcast_from_half:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_load_ushort v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
%val = load half, half addrspace(1)* %in
%val_int = bitcast half %val to i16
store i16 %val_int, i16 addrspace(1)* %out
ret void
}

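; Bitcasting i16 to half is likewise a no-op 16-bit copy.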
define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
; GCN-LABEL: test_bitcast_to_half:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_ushort v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
%val = load i16, i16 addrspace(1)* %in
%val_fp = bitcast i16 %val to half
store half %val_fp, half addrspace(1)* %out
ret void
}

attributes #0 = { nounwind }