1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11 %s
; Loads a half from %a, widens it to float with fpext, and stores the result
; to %r. Every set of checks below expects one v_cvt_f32_f16 for the conversion.
7 define amdgpu_kernel void @fpext_f16_to_f32(
8 ; SI-LABEL: fpext_f16_to_f32:
9 ; SI: ; %bb.0: ; %entry
10 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
11 ; SI-NEXT: s_mov_b32 s7, 0xf000
12 ; SI-NEXT: s_mov_b32 s6, -1
13 ; SI-NEXT: s_mov_b32 s10, s6
14 ; SI-NEXT: s_mov_b32 s11, s7
15 ; SI-NEXT: s_waitcnt lgkmcnt(0)
16 ; SI-NEXT: s_mov_b32 s8, s2
17 ; SI-NEXT: s_mov_b32 s9, s3
18 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
19 ; SI-NEXT: s_mov_b32 s4, s0
20 ; SI-NEXT: s_mov_b32 s5, s1
21 ; SI-NEXT: s_waitcnt vmcnt(0)
22 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
23 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
26 ; GFX89-LABEL: fpext_f16_to_f32:
27 ; GFX89: ; %bb.0: ; %entry
28 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
29 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
30 ; GFX89-NEXT: s_mov_b32 s6, -1
31 ; GFX89-NEXT: s_mov_b32 s10, s6
32 ; GFX89-NEXT: s_mov_b32 s11, s7
33 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
34 ; GFX89-NEXT: s_mov_b32 s8, s2
35 ; GFX89-NEXT: s_mov_b32 s9, s3
36 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
37 ; GFX89-NEXT: s_mov_b32 s4, s0
38 ; GFX89-NEXT: s_mov_b32 s5, s1
39 ; GFX89-NEXT: s_waitcnt vmcnt(0)
40 ; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0
41 ; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
42 ; GFX89-NEXT: s_endpgm
44 ; GFX11-LABEL: fpext_f16_to_f32:
45 ; GFX11: ; %bb.0: ; %entry
46 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
47 ; GFX11-NEXT: s_mov_b32 s6, -1
48 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
49 ; GFX11-NEXT: s_mov_b32 s10, s6
50 ; GFX11-NEXT: s_mov_b32 s11, s7
51 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
52 ; GFX11-NEXT: s_mov_b32 s8, s2
53 ; GFX11-NEXT: s_mov_b32 s9, s3
54 ; GFX11-NEXT: s_mov_b32 s4, s0
55 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
56 ; GFX11-NEXT: s_mov_b32 s5, s1
57 ; GFX11-NEXT: s_waitcnt vmcnt(0)
58 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
59 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
61 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
62 ; GFX11-NEXT: s_endpgm
64 ptr addrspace(1) %a) #0 {
66 %a.val = load half, ptr addrspace(1) %a
67 %r.val = fpext half %a.val to float
68 store float %r.val, ptr addrspace(1) %r
; Loads a half from %a, widens it to double with fpext, and stores the result
; to %r. The checks expect a two-step conversion: v_cvt_f32_f16 followed by
; v_cvt_f64_f32.
72 define amdgpu_kernel void @fpext_f16_to_f64(
73 ; SI-LABEL: fpext_f16_to_f64:
74 ; SI: ; %bb.0: ; %entry
75 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
76 ; SI-NEXT: s_mov_b32 s7, 0xf000
77 ; SI-NEXT: s_mov_b32 s6, -1
78 ; SI-NEXT: s_mov_b32 s10, s6
79 ; SI-NEXT: s_mov_b32 s11, s7
80 ; SI-NEXT: s_waitcnt lgkmcnt(0)
81 ; SI-NEXT: s_mov_b32 s8, s2
82 ; SI-NEXT: s_mov_b32 s9, s3
83 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
84 ; SI-NEXT: s_mov_b32 s4, s0
85 ; SI-NEXT: s_mov_b32 s5, s1
86 ; SI-NEXT: s_waitcnt vmcnt(0)
87 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
88 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
89 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
92 ; GFX89-LABEL: fpext_f16_to_f64:
93 ; GFX89: ; %bb.0: ; %entry
94 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
95 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
96 ; GFX89-NEXT: s_mov_b32 s6, -1
97 ; GFX89-NEXT: s_mov_b32 s10, s6
98 ; GFX89-NEXT: s_mov_b32 s11, s7
99 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
100 ; GFX89-NEXT: s_mov_b32 s8, s2
101 ; GFX89-NEXT: s_mov_b32 s9, s3
102 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
103 ; GFX89-NEXT: s_mov_b32 s4, s0
104 ; GFX89-NEXT: s_mov_b32 s5, s1
105 ; GFX89-NEXT: s_waitcnt vmcnt(0)
106 ; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0
107 ; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
108 ; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
109 ; GFX89-NEXT: s_endpgm
111 ; GFX11-LABEL: fpext_f16_to_f64:
112 ; GFX11: ; %bb.0: ; %entry
113 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
114 ; GFX11-NEXT: s_mov_b32 s6, -1
115 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
116 ; GFX11-NEXT: s_mov_b32 s10, s6
117 ; GFX11-NEXT: s_mov_b32 s11, s7
118 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
119 ; GFX11-NEXT: s_mov_b32 s8, s2
120 ; GFX11-NEXT: s_mov_b32 s9, s3
121 ; GFX11-NEXT: s_mov_b32 s4, s0
122 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
123 ; GFX11-NEXT: s_mov_b32 s5, s1
124 ; GFX11-NEXT: s_waitcnt vmcnt(0)
125 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
126 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
127 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
128 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
129 ; GFX11-NEXT: s_nop 0
130 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
131 ; GFX11-NEXT: s_endpgm
133 ptr addrspace(1) %a) #0 {
135 %a.val = load half, ptr addrspace(1) %a
136 %r.val = fpext half %a.val to double
137 store double %r.val, ptr addrspace(1) %r
; Loads a <2 x half> from %a, widens it to <2 x float> with fpext, and stores
; the result to %r. The tahiti and gfx1100 checks expect the high element to be
; extracted with a 16-bit right shift before its convert; the GFX89 checks
; expect an SDWA convert that selects WORD_1 of the loaded dword instead.
141 define amdgpu_kernel void @fpext_v2f16_to_v2f32(
142 ; SI-LABEL: fpext_v2f16_to_v2f32:
143 ; SI: ; %bb.0: ; %entry
144 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
145 ; SI-NEXT: s_mov_b32 s7, 0xf000
146 ; SI-NEXT: s_mov_b32 s6, -1
147 ; SI-NEXT: s_mov_b32 s10, s6
148 ; SI-NEXT: s_mov_b32 s11, s7
149 ; SI-NEXT: s_waitcnt lgkmcnt(0)
150 ; SI-NEXT: s_mov_b32 s8, s2
151 ; SI-NEXT: s_mov_b32 s9, s3
152 ; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0
153 ; SI-NEXT: s_mov_b32 s4, s0
154 ; SI-NEXT: s_mov_b32 s5, s1
155 ; SI-NEXT: s_waitcnt vmcnt(0)
156 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
157 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
158 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
159 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
162 ; GFX89-LABEL: fpext_v2f16_to_v2f32:
163 ; GFX89: ; %bb.0: ; %entry
164 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
165 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
166 ; GFX89-NEXT: s_mov_b32 s6, -1
167 ; GFX89-NEXT: s_mov_b32 s10, s6
168 ; GFX89-NEXT: s_mov_b32 s11, s7
169 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
170 ; GFX89-NEXT: s_mov_b32 s8, s2
171 ; GFX89-NEXT: s_mov_b32 s9, s3
172 ; GFX89-NEXT: buffer_load_dword v1, off, s[8:11], 0
173 ; GFX89-NEXT: s_mov_b32 s4, s0
174 ; GFX89-NEXT: s_mov_b32 s5, s1
175 ; GFX89-NEXT: s_waitcnt vmcnt(0)
176 ; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v1
177 ; GFX89-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
178 ; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
179 ; GFX89-NEXT: s_endpgm
181 ; GFX11-LABEL: fpext_v2f16_to_v2f32:
182 ; GFX11: ; %bb.0: ; %entry
183 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
184 ; GFX11-NEXT: s_mov_b32 s6, -1
185 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
186 ; GFX11-NEXT: s_mov_b32 s10, s6
187 ; GFX11-NEXT: s_mov_b32 s11, s7
188 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
189 ; GFX11-NEXT: s_mov_b32 s8, s2
190 ; GFX11-NEXT: s_mov_b32 s9, s3
191 ; GFX11-NEXT: s_mov_b32 s4, s0
192 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
193 ; GFX11-NEXT: s_mov_b32 s5, s1
194 ; GFX11-NEXT: s_waitcnt vmcnt(0)
195 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
196 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
197 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
198 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
199 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
200 ; GFX11-NEXT: s_nop 0
201 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
202 ; GFX11-NEXT: s_endpgm
204 ptr addrspace(1) %a) #0 {
206 %a.val = load <2 x half>, ptr addrspace(1) %a
207 %r.val = fpext <2 x half> %a.val to <2 x float>
208 store <2 x float> %r.val, ptr addrspace(1) %r
; Loads a <2 x half> from %a, widens it to <2 x double> with fpext, and stores
; the result to %r. The checks expect each element to go through f32 first
; (v_cvt_f32_f16, then v_cvt_f64_f32), with the two results stored together as
; one 128-bit store.
212 define amdgpu_kernel void @fpext_v2f16_to_v2f64(
213 ; SI-LABEL: fpext_v2f16_to_v2f64:
214 ; SI: ; %bb.0: ; %entry
215 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
216 ; SI-NEXT: s_mov_b32 s7, 0xf000
217 ; SI-NEXT: s_mov_b32 s6, -1
218 ; SI-NEXT: s_mov_b32 s10, s6
219 ; SI-NEXT: s_mov_b32 s11, s7
220 ; SI-NEXT: s_waitcnt lgkmcnt(0)
221 ; SI-NEXT: s_mov_b32 s8, s2
222 ; SI-NEXT: s_mov_b32 s9, s3
223 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
224 ; SI-NEXT: s_mov_b32 s4, s0
225 ; SI-NEXT: s_mov_b32 s5, s1
226 ; SI-NEXT: s_waitcnt vmcnt(0)
227 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
228 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
229 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1
230 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
231 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
232 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
235 ; GFX89-LABEL: fpext_v2f16_to_v2f64:
236 ; GFX89: ; %bb.0: ; %entry
237 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
238 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
239 ; GFX89-NEXT: s_mov_b32 s6, -1
240 ; GFX89-NEXT: s_mov_b32 s10, s6
241 ; GFX89-NEXT: s_mov_b32 s11, s7
242 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
243 ; GFX89-NEXT: s_mov_b32 s8, s2
244 ; GFX89-NEXT: s_mov_b32 s9, s3
245 ; GFX89-NEXT: buffer_load_dword v0, off, s[8:11], 0
246 ; GFX89-NEXT: s_mov_b32 s4, s0
247 ; GFX89-NEXT: s_mov_b32 s5, s1
248 ; GFX89-NEXT: s_waitcnt vmcnt(0)
249 ; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
250 ; GFX89-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
251 ; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
252 ; GFX89-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
253 ; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
254 ; GFX89-NEXT: s_endpgm
256 ; GFX11-LABEL: fpext_v2f16_to_v2f64:
257 ; GFX11: ; %bb.0: ; %entry
258 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
259 ; GFX11-NEXT: s_mov_b32 s6, -1
260 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
261 ; GFX11-NEXT: s_mov_b32 s10, s6
262 ; GFX11-NEXT: s_mov_b32 s11, s7
263 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
264 ; GFX11-NEXT: s_mov_b32 s8, s2
265 ; GFX11-NEXT: s_mov_b32 s9, s3
266 ; GFX11-NEXT: s_mov_b32 s4, s0
267 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
268 ; GFX11-NEXT: s_mov_b32 s5, s1
269 ; GFX11-NEXT: s_waitcnt vmcnt(0)
270 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
271 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
272 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
273 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
274 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
275 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
276 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
277 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
278 ; GFX11-NEXT: s_nop 0
279 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
280 ; GFX11-NEXT: s_endpgm
282 ptr addrspace(1) %a) {
284 %a.val = load <2 x half>, ptr addrspace(1) %a
285 %r.val = fpext <2 x half> %a.val to <2 x double>
286 store <2 x double> %r.val, ptr addrspace(1) %r
; Scalar-source variant: the i32 kernel argument %a is truncated to i16,
; bitcast to half, widened to float with fpext, and stored to %r. The checks
; expect the convert to read the loaded SGPR (s2) directly.
; NOTE(review): despite "fneg" in the function name, the visible IR contains no
; negation and the checks expect an unmodified v_cvt_f32_f16 - confirm whether
; a negation was intended here.
290 define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) {
291 ; SI-LABEL: s_fneg_fpext_f16_to_f32:
292 ; SI: ; %bb.0: ; %entry
293 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
294 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
295 ; SI-NEXT: s_mov_b32 s3, 0xf000
296 ; SI-NEXT: s_waitcnt lgkmcnt(0)
297 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
298 ; SI-NEXT: s_mov_b32 s2, -1
299 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
302 ; VI-LABEL: s_fneg_fpext_f16_to_f32:
303 ; VI: ; %bb.0: ; %entry
304 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
305 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
306 ; VI-NEXT: s_mov_b32 s3, 0xf000
307 ; VI-NEXT: s_waitcnt lgkmcnt(0)
308 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
309 ; VI-NEXT: s_mov_b32 s2, -1
310 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
313 ; GFX9-LABEL: s_fneg_fpext_f16_to_f32:
314 ; GFX9: ; %bb.0: ; %entry
315 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
316 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
317 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
318 ; GFX9-NEXT: s_mov_b32 s6, -1
319 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
320 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s2
321 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
322 ; GFX9-NEXT: s_endpgm
324 ; GFX11-LABEL: s_fneg_fpext_f16_to_f32:
325 ; GFX11: ; %bb.0: ; %entry
326 ; GFX11-NEXT: s_clause 0x1
327 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
328 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
329 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
330 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
331 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
332 ; GFX11-NEXT: s_mov_b32 s2, -1
333 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
334 ; GFX11-NEXT: s_nop 0
335 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
336 ; GFX11-NEXT: s_endpgm
338 %a.trunc = trunc i32 %a to i16
339 %a.val = bitcast i16 %a.trunc to half
340 %r.val = fpext half %a.val to float
341 store float %r.val, ptr addrspace(1) %r
; Negates a loaded half (written as fsub -0.0, x), widens it to float with
; fpext, and stores the result to %r. The checks expect the negation to be
; folded into the convert as a source modifier (-v0), with no separate
; sign-flip instruction.
345 define amdgpu_kernel void @fneg_fpext_f16_to_f32(
346 ; SI-LABEL: fneg_fpext_f16_to_f32:
347 ; SI: ; %bb.0: ; %entry
348 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
349 ; SI-NEXT: s_mov_b32 s7, 0xf000
350 ; SI-NEXT: s_mov_b32 s6, -1
351 ; SI-NEXT: s_mov_b32 s10, s6
352 ; SI-NEXT: s_mov_b32 s11, s7
353 ; SI-NEXT: s_waitcnt lgkmcnt(0)
354 ; SI-NEXT: s_mov_b32 s8, s2
355 ; SI-NEXT: s_mov_b32 s9, s3
356 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
357 ; SI-NEXT: s_mov_b32 s4, s0
358 ; SI-NEXT: s_mov_b32 s5, s1
359 ; SI-NEXT: s_waitcnt vmcnt(0)
360 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
361 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
364 ; GFX89-LABEL: fneg_fpext_f16_to_f32:
365 ; GFX89: ; %bb.0: ; %entry
366 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
367 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
368 ; GFX89-NEXT: s_mov_b32 s6, -1
369 ; GFX89-NEXT: s_mov_b32 s10, s6
370 ; GFX89-NEXT: s_mov_b32 s11, s7
371 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
372 ; GFX89-NEXT: s_mov_b32 s8, s2
373 ; GFX89-NEXT: s_mov_b32 s9, s3
374 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
375 ; GFX89-NEXT: s_mov_b32 s4, s0
376 ; GFX89-NEXT: s_mov_b32 s5, s1
377 ; GFX89-NEXT: s_waitcnt vmcnt(0)
378 ; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -v0
379 ; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
380 ; GFX89-NEXT: s_endpgm
382 ; GFX11-LABEL: fneg_fpext_f16_to_f32:
383 ; GFX11: ; %bb.0: ; %entry
384 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
385 ; GFX11-NEXT: s_mov_b32 s6, -1
386 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
387 ; GFX11-NEXT: s_mov_b32 s10, s6
388 ; GFX11-NEXT: s_mov_b32 s11, s7
389 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
390 ; GFX11-NEXT: s_mov_b32 s8, s2
391 ; GFX11-NEXT: s_mov_b32 s9, s3
392 ; GFX11-NEXT: s_mov_b32 s4, s0
393 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
394 ; GFX11-NEXT: s_mov_b32 s5, s1
395 ; GFX11-NEXT: s_waitcnt vmcnt(0)
396 ; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -v0
397 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
398 ; GFX11-NEXT: s_nop 0
399 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
400 ; GFX11-NEXT: s_endpgm
402 ptr addrspace(1) %a) {
404 %a.val = load half, ptr addrspace(1) %a
405 %a.neg = fsub half -0.0, %a.val
406 %r.val = fpext half %a.neg to float
407 store float %r.val, ptr addrspace(1) %r
; Takes llvm.fabs.f16 of a loaded half, widens it to float with fpext, and
; stores the result to %r. The checks expect the absolute value to be folded
; into the convert as a source modifier (|v0|).
411 define amdgpu_kernel void @fabs_fpext_f16_to_f32(
412 ; SI-LABEL: fabs_fpext_f16_to_f32:
413 ; SI: ; %bb.0: ; %entry
414 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
415 ; SI-NEXT: s_mov_b32 s7, 0xf000
416 ; SI-NEXT: s_mov_b32 s6, -1
417 ; SI-NEXT: s_mov_b32 s10, s6
418 ; SI-NEXT: s_mov_b32 s11, s7
419 ; SI-NEXT: s_waitcnt lgkmcnt(0)
420 ; SI-NEXT: s_mov_b32 s8, s2
421 ; SI-NEXT: s_mov_b32 s9, s3
422 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
423 ; SI-NEXT: s_mov_b32 s4, s0
424 ; SI-NEXT: s_mov_b32 s5, s1
425 ; SI-NEXT: s_waitcnt vmcnt(0)
426 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
427 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
430 ; GFX89-LABEL: fabs_fpext_f16_to_f32:
431 ; GFX89: ; %bb.0: ; %entry
432 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
433 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
434 ; GFX89-NEXT: s_mov_b32 s6, -1
435 ; GFX89-NEXT: s_mov_b32 s10, s6
436 ; GFX89-NEXT: s_mov_b32 s11, s7
437 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
438 ; GFX89-NEXT: s_mov_b32 s8, s2
439 ; GFX89-NEXT: s_mov_b32 s9, s3
440 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
441 ; GFX89-NEXT: s_mov_b32 s4, s0
442 ; GFX89-NEXT: s_mov_b32 s5, s1
443 ; GFX89-NEXT: s_waitcnt vmcnt(0)
444 ; GFX89-NEXT: v_cvt_f32_f16_e64 v0, |v0|
445 ; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
446 ; GFX89-NEXT: s_endpgm
448 ; GFX11-LABEL: fabs_fpext_f16_to_f32:
449 ; GFX11: ; %bb.0: ; %entry
450 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
451 ; GFX11-NEXT: s_mov_b32 s6, -1
452 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
453 ; GFX11-NEXT: s_mov_b32 s10, s6
454 ; GFX11-NEXT: s_mov_b32 s11, s7
455 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
456 ; GFX11-NEXT: s_mov_b32 s8, s2
457 ; GFX11-NEXT: s_mov_b32 s9, s3
458 ; GFX11-NEXT: s_mov_b32 s4, s0
459 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
460 ; GFX11-NEXT: s_mov_b32 s5, s1
461 ; GFX11-NEXT: s_waitcnt vmcnt(0)
462 ; GFX11-NEXT: v_cvt_f32_f16_e64 v0, |v0|
463 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
464 ; GFX11-NEXT: s_nop 0
465 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
466 ; GFX11-NEXT: s_endpgm
468 ptr addrspace(1) %a) {
470 %a.val = load half, ptr addrspace(1) %a
471 %a.fabs = call half @llvm.fabs.f16(half %a.val)
472 %r.val = fpext half %a.fabs to float
473 store float %r.val, ptr addrspace(1) %r
; Applies llvm.fabs.f16 and then a negation (fsub -0.0, x) to a loaded half,
; widens the result to float with fpext, and stores it to %r. The checks expect
; both operations to be folded into the convert as the combined source
; modifier -|v0|.
477 define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
478 ; SI-LABEL: fneg_fabs_fpext_f16_to_f32:
479 ; SI: ; %bb.0: ; %entry
480 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
481 ; SI-NEXT: s_mov_b32 s7, 0xf000
482 ; SI-NEXT: s_mov_b32 s6, -1
483 ; SI-NEXT: s_mov_b32 s10, s6
484 ; SI-NEXT: s_mov_b32 s11, s7
485 ; SI-NEXT: s_waitcnt lgkmcnt(0)
486 ; SI-NEXT: s_mov_b32 s8, s2
487 ; SI-NEXT: s_mov_b32 s9, s3
488 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
489 ; SI-NEXT: s_mov_b32 s4, s0
490 ; SI-NEXT: s_mov_b32 s5, s1
491 ; SI-NEXT: s_waitcnt vmcnt(0)
492 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
493 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
496 ; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32:
497 ; GFX89: ; %bb.0: ; %entry
498 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
499 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
500 ; GFX89-NEXT: s_mov_b32 s6, -1
501 ; GFX89-NEXT: s_mov_b32 s10, s6
502 ; GFX89-NEXT: s_mov_b32 s11, s7
503 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
504 ; GFX89-NEXT: s_mov_b32 s8, s2
505 ; GFX89-NEXT: s_mov_b32 s9, s3
506 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
507 ; GFX89-NEXT: s_mov_b32 s4, s0
508 ; GFX89-NEXT: s_mov_b32 s5, s1
509 ; GFX89-NEXT: s_waitcnt vmcnt(0)
510 ; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
511 ; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
512 ; GFX89-NEXT: s_endpgm
514 ; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32:
515 ; GFX11: ; %bb.0: ; %entry
516 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
517 ; GFX11-NEXT: s_mov_b32 s6, -1
518 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
519 ; GFX11-NEXT: s_mov_b32 s10, s6
520 ; GFX11-NEXT: s_mov_b32 s11, s7
521 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
522 ; GFX11-NEXT: s_mov_b32 s8, s2
523 ; GFX11-NEXT: s_mov_b32 s9, s3
524 ; GFX11-NEXT: s_mov_b32 s4, s0
525 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
526 ; GFX11-NEXT: s_mov_b32 s5, s1
527 ; GFX11-NEXT: s_waitcnt vmcnt(0)
528 ; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
529 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
530 ; GFX11-NEXT: s_nop 0
531 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
532 ; GFX11-NEXT: s_endpgm
534 ptr addrspace(1) %a) {
536 %a.val = load half, ptr addrspace(1) %a
537 %a.fabs = call half @llvm.fabs.f16(half %a.val)
538 %a.fneg.fabs = fsub half -0.0, %a.fabs
539 %r.val = fpext half %a.fneg.fabs to float
540 store float %r.val, ptr addrspace(1) %r
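544 ; FIXME: Folding the fneg into a source modifier here only wastes code size, because the negated value still has to be materialized separately for its second use.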
; The negated half has two uses: it is widened to float with fpext, and it is
; also stored directly as a half. Both stores are volatile. The tahiti checks
; expect one v_xor with 0x8000 whose result feeds a plain convert and the half
; store. The GFX89 and gfx1100 checks expect a convert with a -v0 source
; modifier as well as the v_xor that produces the stored half.
546 define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
547 ; SI-LABEL: fneg_multi_use_fpext_f16_to_f32:
548 ; SI: ; %bb.0: ; %entry
549 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
550 ; SI-NEXT: s_mov_b32 s7, 0xf000
551 ; SI-NEXT: s_mov_b32 s6, -1
552 ; SI-NEXT: s_mov_b32 s10, s6
553 ; SI-NEXT: s_mov_b32 s11, s7
554 ; SI-NEXT: s_waitcnt lgkmcnt(0)
555 ; SI-NEXT: s_mov_b32 s8, s2
556 ; SI-NEXT: s_mov_b32 s9, s3
557 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
558 ; SI-NEXT: s_mov_b32 s4, s0
559 ; SI-NEXT: s_mov_b32 s5, s1
560 ; SI-NEXT: s_waitcnt vmcnt(0)
561 ; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
562 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
563 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
564 ; SI-NEXT: s_waitcnt vmcnt(0)
565 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
566 ; SI-NEXT: s_waitcnt vmcnt(0)
569 ; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32:
570 ; GFX89: ; %bb.0: ; %entry
571 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
572 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
573 ; GFX89-NEXT: s_mov_b32 s6, -1
574 ; GFX89-NEXT: s_mov_b32 s10, s6
575 ; GFX89-NEXT: s_mov_b32 s11, s7
576 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
577 ; GFX89-NEXT: s_mov_b32 s8, s2
578 ; GFX89-NEXT: s_mov_b32 s9, s3
579 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
580 ; GFX89-NEXT: s_mov_b32 s4, s0
581 ; GFX89-NEXT: s_mov_b32 s5, s1
582 ; GFX89-NEXT: s_waitcnt vmcnt(0)
583 ; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0
584 ; GFX89-NEXT: v_xor_b32_e32 v0, 0x8000, v0
585 ; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
586 ; GFX89-NEXT: s_waitcnt vmcnt(0)
587 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
588 ; GFX89-NEXT: s_waitcnt vmcnt(0)
589 ; GFX89-NEXT: s_endpgm
591 ; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32:
592 ; GFX11: ; %bb.0: ; %entry
593 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
594 ; GFX11-NEXT: s_mov_b32 s6, -1
595 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
596 ; GFX11-NEXT: s_mov_b32 s10, s6
597 ; GFX11-NEXT: s_mov_b32 s11, s7
598 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
599 ; GFX11-NEXT: s_mov_b32 s8, s2
600 ; GFX11-NEXT: s_mov_b32 s9, s3
601 ; GFX11-NEXT: s_mov_b32 s4, s0
602 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
603 ; GFX11-NEXT: s_mov_b32 s5, s1
604 ; GFX11-NEXT: s_waitcnt vmcnt(0)
605 ; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0
606 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
607 ; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
608 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
609 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
610 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
611 ; GFX11-NEXT: s_nop 0
612 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
613 ; GFX11-NEXT: s_endpgm
615 ptr addrspace(1) %a) {
617 %a.val = load half, ptr addrspace(1) %a
618 %a.neg = fsub half -0.0, %a.val
619 %r.val = fpext half %a.neg to float
620 store volatile float %r.val, ptr addrspace(1) %r
621 store volatile half %a.neg, ptr addrspace(1) undef
; The negated half feeds two users that can both absorb a source modifier: an
; fpext to float and an fmul with the original value. Both results are stored
; with volatile stores. The GFX89 and gfx1100 checks expect -v0 modifiers on
; both v_cvt_f32_f16 and v_mul_f16, with no separate sign-flip instruction. The
; tahiti checks expect the multiply to be done in f32 (converting v0 and -v0,
; then v_mul_f32) and converted back with v_cvt_f16_f32.
625 define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
626 ; SI-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
627 ; SI: ; %bb.0: ; %entry
628 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
629 ; SI-NEXT: s_mov_b32 s7, 0xf000
630 ; SI-NEXT: s_mov_b32 s6, -1
631 ; SI-NEXT: s_mov_b32 s10, s6
632 ; SI-NEXT: s_mov_b32 s11, s7
633 ; SI-NEXT: s_waitcnt lgkmcnt(0)
634 ; SI-NEXT: s_mov_b32 s8, s2
635 ; SI-NEXT: s_mov_b32 s9, s3
636 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
637 ; SI-NEXT: s_mov_b32 s4, s0
638 ; SI-NEXT: s_mov_b32 s5, s1
639 ; SI-NEXT: s_waitcnt vmcnt(0)
640 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
641 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
642 ; SI-NEXT: v_mul_f32_e32 v1, v0, v1
643 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
644 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
645 ; SI-NEXT: s_waitcnt vmcnt(0)
646 ; SI-NEXT: buffer_store_short v1, off, s[4:7], 0
647 ; SI-NEXT: s_waitcnt vmcnt(0)
650 ; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
651 ; GFX89: ; %bb.0: ; %entry
652 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
653 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
654 ; GFX89-NEXT: s_mov_b32 s6, -1
655 ; GFX89-NEXT: s_mov_b32 s10, s6
656 ; GFX89-NEXT: s_mov_b32 s11, s7
657 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
658 ; GFX89-NEXT: s_mov_b32 s8, s2
659 ; GFX89-NEXT: s_mov_b32 s9, s3
660 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
661 ; GFX89-NEXT: s_mov_b32 s4, s0
662 ; GFX89-NEXT: s_mov_b32 s5, s1
663 ; GFX89-NEXT: s_waitcnt vmcnt(0)
664 ; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0
665 ; GFX89-NEXT: v_mul_f16_e64 v0, -v0, v0
666 ; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
667 ; GFX89-NEXT: s_waitcnt vmcnt(0)
668 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
669 ; GFX89-NEXT: s_waitcnt vmcnt(0)
670 ; GFX89-NEXT: s_endpgm
672 ; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
673 ; GFX11: ; %bb.0: ; %entry
674 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
675 ; GFX11-NEXT: s_mov_b32 s6, -1
676 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
677 ; GFX11-NEXT: s_mov_b32 s10, s6
678 ; GFX11-NEXT: s_mov_b32 s11, s7
679 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
680 ; GFX11-NEXT: s_mov_b32 s8, s2
681 ; GFX11-NEXT: s_mov_b32 s9, s3
682 ; GFX11-NEXT: s_mov_b32 s4, s0
683 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
684 ; GFX11-NEXT: s_mov_b32 s5, s1
685 ; GFX11-NEXT: s_waitcnt vmcnt(0)
686 ; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0
687 ; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v0
688 ; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
689 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
690 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
691 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
692 ; GFX11-NEXT: s_nop 0
693 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
694 ; GFX11-NEXT: s_endpgm
696 ptr addrspace(1) %a) {
698 %a.val = load half, ptr addrspace(1) %a
699 %a.neg = fsub half -0.0, %a.val
700 %r.val = fpext half %a.neg to float
701 %mul = fmul half %a.neg, %a.val
702 store volatile float %r.val, ptr addrspace(1) %r
703 store volatile half %mul, ptr addrspace(1) undef
; The llvm.fabs.f16 result has two uses: it is widened to float with fpext, and
; it is also stored directly as a half. Both stores are volatile. The tahiti
; checks expect one v_and with 0x7fff whose result feeds a plain convert and
; the half store. The GFX89 and gfx1100 checks expect a convert with a |v0|
; source modifier as well as the v_and that produces the stored half.
707 define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
708 ; SI-LABEL: fabs_multi_use_fpext_f16_to_f32:
709 ; SI: ; %bb.0: ; %entry
710 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
711 ; SI-NEXT: s_mov_b32 s7, 0xf000
712 ; SI-NEXT: s_mov_b32 s6, -1
713 ; SI-NEXT: s_mov_b32 s10, s6
714 ; SI-NEXT: s_mov_b32 s11, s7
715 ; SI-NEXT: s_waitcnt lgkmcnt(0)
716 ; SI-NEXT: s_mov_b32 s8, s2
717 ; SI-NEXT: s_mov_b32 s9, s3
718 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
719 ; SI-NEXT: s_mov_b32 s4, s0
720 ; SI-NEXT: s_mov_b32 s5, s1
721 ; SI-NEXT: s_waitcnt vmcnt(0)
722 ; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
723 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
724 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
725 ; SI-NEXT: s_waitcnt vmcnt(0)
726 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
727 ; SI-NEXT: s_waitcnt vmcnt(0)
730 ; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32:
731 ; GFX89: ; %bb.0: ; %entry
732 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
733 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
734 ; GFX89-NEXT: s_mov_b32 s6, -1
735 ; GFX89-NEXT: s_mov_b32 s10, s6
736 ; GFX89-NEXT: s_mov_b32 s11, s7
737 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
738 ; GFX89-NEXT: s_mov_b32 s8, s2
739 ; GFX89-NEXT: s_mov_b32 s9, s3
740 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
741 ; GFX89-NEXT: s_mov_b32 s4, s0
742 ; GFX89-NEXT: s_mov_b32 s5, s1
743 ; GFX89-NEXT: s_waitcnt vmcnt(0)
744 ; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0|
745 ; GFX89-NEXT: v_and_b32_e32 v0, 0x7fff, v0
746 ; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
747 ; GFX89-NEXT: s_waitcnt vmcnt(0)
748 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
749 ; GFX89-NEXT: s_waitcnt vmcnt(0)
750 ; GFX89-NEXT: s_endpgm
752 ; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32:
753 ; GFX11: ; %bb.0: ; %entry
754 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
755 ; GFX11-NEXT: s_mov_b32 s6, -1
756 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
757 ; GFX11-NEXT: s_mov_b32 s10, s6
758 ; GFX11-NEXT: s_mov_b32 s11, s7
759 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
760 ; GFX11-NEXT: s_mov_b32 s8, s2
761 ; GFX11-NEXT: s_mov_b32 s9, s3
762 ; GFX11-NEXT: s_mov_b32 s4, s0
763 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
764 ; GFX11-NEXT: s_mov_b32 s5, s1
765 ; GFX11-NEXT: s_waitcnt vmcnt(0)
766 ; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0|
767 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
768 ; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
769 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
770 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
771 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
772 ; GFX11-NEXT: s_nop 0
773 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
774 ; GFX11-NEXT: s_endpgm
776 ptr addrspace(1) %a) {
778 %a.val = load half, ptr addrspace(1) %a
779 %a.fabs = call half @llvm.fabs.f16(half %a.val)
780 %r.val = fpext half %a.fabs to float
781 store volatile float %r.val, ptr addrspace(1) %r
782 store volatile half %a.fabs, ptr addrspace(1) undef
; The llvm.fabs.f16 result feeds two users that can both absorb a source
; modifier: an fpext to float and an fmul with the original value. Both results
; are stored with volatile stores. The GFX89 and gfx1100 checks expect |v0|
; modifiers on both v_cvt_f32_f16 and v_mul_f16, with no separate mask
; instruction. The tahiti checks expect one plain convert to f32, a v_mul_f32
; using |v0|, a convert of the product back to f16, and a v_and with
; 0x7fffffff to form the stored float.
786 define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
787 ; SI-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
788 ; SI: ; %bb.0: ; %entry
789 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
790 ; SI-NEXT: s_mov_b32 s7, 0xf000
791 ; SI-NEXT: s_mov_b32 s6, -1
792 ; SI-NEXT: s_mov_b32 s10, s6
793 ; SI-NEXT: s_mov_b32 s11, s7
794 ; SI-NEXT: s_waitcnt lgkmcnt(0)
795 ; SI-NEXT: s_mov_b32 s8, s2
796 ; SI-NEXT: s_mov_b32 s9, s3
797 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
798 ; SI-NEXT: s_mov_b32 s4, s0
799 ; SI-NEXT: s_mov_b32 s5, s1
800 ; SI-NEXT: s_waitcnt vmcnt(0)
801 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
802 ; SI-NEXT: v_mul_f32_e64 v1, |v0|, v0
803 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
804 ; SI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
805 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
806 ; SI-NEXT: s_waitcnt vmcnt(0)
807 ; SI-NEXT: buffer_store_short v1, off, s[4:7], 0
808 ; SI-NEXT: s_waitcnt vmcnt(0)
811 ; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
812 ; GFX89: ; %bb.0: ; %entry
813 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
814 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
815 ; GFX89-NEXT: s_mov_b32 s6, -1
816 ; GFX89-NEXT: s_mov_b32 s10, s6
817 ; GFX89-NEXT: s_mov_b32 s11, s7
818 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
819 ; GFX89-NEXT: s_mov_b32 s8, s2
820 ; GFX89-NEXT: s_mov_b32 s9, s3
821 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
822 ; GFX89-NEXT: s_mov_b32 s4, s0
823 ; GFX89-NEXT: s_mov_b32 s5, s1
824 ; GFX89-NEXT: s_waitcnt vmcnt(0)
825 ; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0|
826 ; GFX89-NEXT: v_mul_f16_e64 v0, |v0|, v0
827 ; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
828 ; GFX89-NEXT: s_waitcnt vmcnt(0)
829 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
830 ; GFX89-NEXT: s_waitcnt vmcnt(0)
831 ; GFX89-NEXT: s_endpgm
833 ; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
834 ; GFX11: ; %bb.0: ; %entry
835 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
836 ; GFX11-NEXT: s_mov_b32 s6, -1
837 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
838 ; GFX11-NEXT: s_mov_b32 s10, s6
839 ; GFX11-NEXT: s_mov_b32 s11, s7
840 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
841 ; GFX11-NEXT: s_mov_b32 s8, s2
842 ; GFX11-NEXT: s_mov_b32 s9, s3
843 ; GFX11-NEXT: s_mov_b32 s4, s0
844 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
845 ; GFX11-NEXT: s_mov_b32 s5, s1
846 ; GFX11-NEXT: s_waitcnt vmcnt(0)
847 ; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0|
848 ; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, v0
849 ; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
850 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
851 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
852 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
853 ; GFX11-NEXT: s_nop 0
854 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
855 ; GFX11-NEXT: s_endpgm
857 ptr addrspace(1) %a) {
859 %a.val = load half, ptr addrspace(1) %a
860 %a.fabs = call half @llvm.fabs.f16(half %a.val)
861 %r.val = fpext half %a.fabs to float
862 %mul = fmul half %a.fabs, %a.val
863 store volatile float %r.val, ptr addrspace(1) %r
864 store volatile half %mul, ptr addrspace(1) undef
; Multi-use fneg(fabs(x)) feeding an fpext from half to float.
; The negated absolute value of the loaded half is extended to float and is
; also stored directly as a half, so it has a second use besides the fpext.
; Expected code per the assertions below: the tahiti run sets the sign bit
; with v_or_b32 0x8000 and converts that result; the fiji, gfx900 and gfx1100
; runs apply the -|v0| source modifier on v_cvt_f32_f16_e64 and additionally
; emit v_or_b32 0x8000 to produce the half value that is stored.
868 define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
869 ; SI-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
870 ; SI: ; %bb.0: ; %entry
871 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
872 ; SI-NEXT: s_mov_b32 s7, 0xf000
873 ; SI-NEXT: s_mov_b32 s6, -1
874 ; SI-NEXT: s_mov_b32 s10, s6
875 ; SI-NEXT: s_mov_b32 s11, s7
876 ; SI-NEXT: s_waitcnt lgkmcnt(0)
877 ; SI-NEXT: s_mov_b32 s8, s2
878 ; SI-NEXT: s_mov_b32 s9, s3
879 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
880 ; SI-NEXT: s_mov_b32 s4, s0
881 ; SI-NEXT: s_mov_b32 s5, s1
882 ; SI-NEXT: s_waitcnt vmcnt(0)
883 ; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0
884 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
885 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
886 ; SI-NEXT: s_waitcnt vmcnt(0)
887 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
888 ; SI-NEXT: s_waitcnt vmcnt(0)
891 ; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
892 ; GFX89: ; %bb.0: ; %entry
893 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
894 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
895 ; GFX89-NEXT: s_mov_b32 s6, -1
896 ; GFX89-NEXT: s_mov_b32 s10, s6
897 ; GFX89-NEXT: s_mov_b32 s11, s7
898 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
899 ; GFX89-NEXT: s_mov_b32 s8, s2
900 ; GFX89-NEXT: s_mov_b32 s9, s3
901 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
902 ; GFX89-NEXT: s_mov_b32 s4, s0
903 ; GFX89-NEXT: s_mov_b32 s5, s1
904 ; GFX89-NEXT: s_waitcnt vmcnt(0)
905 ; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
906 ; GFX89-NEXT: v_or_b32_e32 v0, 0x8000, v0
907 ; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
908 ; GFX89-NEXT: s_waitcnt vmcnt(0)
909 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
910 ; GFX89-NEXT: s_waitcnt vmcnt(0)
911 ; GFX89-NEXT: s_endpgm
913 ; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
914 ; GFX11: ; %bb.0: ; %entry
915 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
916 ; GFX11-NEXT: s_mov_b32 s6, -1
917 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
918 ; GFX11-NEXT: s_mov_b32 s10, s6
919 ; GFX11-NEXT: s_mov_b32 s11, s7
920 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
921 ; GFX11-NEXT: s_mov_b32 s8, s2
922 ; GFX11-NEXT: s_mov_b32 s9, s3
923 ; GFX11-NEXT: s_mov_b32 s4, s0
924 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
925 ; GFX11-NEXT: s_mov_b32 s5, s1
926 ; GFX11-NEXT: s_waitcnt vmcnt(0)
927 ; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
928 ; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
929 ; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
930 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
931 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
932 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
933 ; GFX11-NEXT: s_nop 0
934 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
935 ; GFX11-NEXT: s_endpgm
937 ptr addrspace(1) %a) {
939 %a.val = load half, ptr addrspace(1) %a
940 %a.fabs = call half @llvm.fabs.f16(half %a.val)
; Negation is spelled as a subtraction from -0.0; the assertions above show it
; selected as a sign-bit OR or a neg source modifier.
941 %a.fneg.fabs = fsub half -0.0, %a.fabs
942 %r.val = fpext half %a.fneg.fabs to float
; Both stores are volatile. The second one writes the negated value to an undef
; address, presumably just to give %a.fneg.fabs a use other than the fpext.
943 store volatile float %r.val, ptr addrspace(1) %r
944 store volatile half %a.fneg.fabs, ptr addrspace(1) undef
; Multi-use fneg(fabs(x)) where the extra user is an fmul rather than a store.
; The negated absolute value of the loaded half feeds both an fpext to float
; and a half multiply with the original loaded value.
; Expected code per the assertions below: the tahiti run converts the input to
; f32 first, multiplies in f32 with a -|v0| operand, converts the product back
; with v_cvt_f16_f32 and sets the float sign bit with v_or_b32 0x80000000; the
; fiji, gfx900 and gfx1100 runs use the -|v0| source modifier on both
; v_cvt_f32_f16_e64 and v_mul_f16_e64.
948 define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
949 ; SI-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
950 ; SI: ; %bb.0: ; %entry
951 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
952 ; SI-NEXT: s_mov_b32 s7, 0xf000
953 ; SI-NEXT: s_mov_b32 s6, -1
954 ; SI-NEXT: s_mov_b32 s10, s6
955 ; SI-NEXT: s_mov_b32 s11, s7
956 ; SI-NEXT: s_waitcnt lgkmcnt(0)
957 ; SI-NEXT: s_mov_b32 s8, s2
958 ; SI-NEXT: s_mov_b32 s9, s3
959 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
960 ; SI-NEXT: s_mov_b32 s4, s0
961 ; SI-NEXT: s_mov_b32 s5, s1
962 ; SI-NEXT: s_waitcnt vmcnt(0)
963 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
964 ; SI-NEXT: v_mul_f32_e64 v1, -|v0|, v0
965 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
966 ; SI-NEXT: v_or_b32_e32 v0, 0x80000000, v0
967 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
968 ; SI-NEXT: s_waitcnt vmcnt(0)
969 ; SI-NEXT: buffer_store_short v1, off, s[4:7], 0
970 ; SI-NEXT: s_waitcnt vmcnt(0)
973 ; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
974 ; GFX89: ; %bb.0: ; %entry
975 ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
976 ; GFX89-NEXT: s_mov_b32 s7, 0xf000
977 ; GFX89-NEXT: s_mov_b32 s6, -1
978 ; GFX89-NEXT: s_mov_b32 s10, s6
979 ; GFX89-NEXT: s_mov_b32 s11, s7
980 ; GFX89-NEXT: s_waitcnt lgkmcnt(0)
981 ; GFX89-NEXT: s_mov_b32 s8, s2
982 ; GFX89-NEXT: s_mov_b32 s9, s3
983 ; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
984 ; GFX89-NEXT: s_mov_b32 s4, s0
985 ; GFX89-NEXT: s_mov_b32 s5, s1
986 ; GFX89-NEXT: s_waitcnt vmcnt(0)
987 ; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
988 ; GFX89-NEXT: v_mul_f16_e64 v0, -|v0|, v0
989 ; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
990 ; GFX89-NEXT: s_waitcnt vmcnt(0)
991 ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
992 ; GFX89-NEXT: s_waitcnt vmcnt(0)
993 ; GFX89-NEXT: s_endpgm
995 ; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
996 ; GFX11: ; %bb.0: ; %entry
997 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
998 ; GFX11-NEXT: s_mov_b32 s6, -1
999 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
1000 ; GFX11-NEXT: s_mov_b32 s10, s6
1001 ; GFX11-NEXT: s_mov_b32 s11, s7
1002 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1003 ; GFX11-NEXT: s_mov_b32 s8, s2
1004 ; GFX11-NEXT: s_mov_b32 s9, s3
1005 ; GFX11-NEXT: s_mov_b32 s4, s0
1006 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
1007 ; GFX11-NEXT: s_mov_b32 s5, s1
1008 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1009 ; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
1010 ; GFX11-NEXT: v_mul_f16_e64 v0, -|v0|, v0
1011 ; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
1012 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1013 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
1014 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1015 ; GFX11-NEXT: s_nop 0
1016 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1017 ; GFX11-NEXT: s_endpgm
1018 ptr addrspace(1) %r,
1019 ptr addrspace(1) %a) {
1021 %a.val = load half, ptr addrspace(1) %a
1022 %a.fabs = call half @llvm.fabs.f16(half %a.val)
; Negation is spelled as a subtraction from -0.0; the assertions above show it
; selected as a neg source modifier or, for the tahiti float result, a sign-bit OR.
1023 %a.fneg.fabs = fsub half -0.0, %a.fabs
1024 %r.val = fpext half %a.fneg.fabs to float
; Second user of %a.fneg.fabs: a half multiply with the original loaded value.
1025 %mul = fmul half %a.fneg.fabs, %a.val
; Both stores are volatile; the product is written to an undef address.
1026 store volatile float %r.val, ptr addrspace(1) %r
1027 store volatile half %mul, ptr addrspace(1) undef
; Half-precision fabs intrinsic called by the fabs test kernels in this file.
; It is tagged with attribute group #1, defined below as nounwind readnone.
1031 declare half @llvm.fabs.f16(half) #1
1033 attributes #1 = { nounwind readnone }