1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-DENORM %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-FLUSH %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-DENORM %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-FLUSH %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-DENORM %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FLUSH %s
9 ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't
10 ; add an instruction if the fadd has more than one use.
12 declare half @llvm.fabs.f16(half) #1
13 declare float @llvm.fabs.f32(float) #1
; f32 multiple-use case. %a17 (the selected absolute value times 2.0) feeds both
; operands of %a18 and one operand of %a19, so the doubling has three uses.
; The checks for every target keep it as a single v_add_f32 of |v0| with itself,
; followed by one v_mul_f32 and one v_mad_f32 / v_fma_f32 that negates the
; squared term and adds an inline 1.0. %z is not referenced by the body.
15 define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 {
16 ; VI-LABEL: multiple_fadd_use_test_f32:
18 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
19 ; VI-NEXT: s_waitcnt lgkmcnt(0)
20 ; VI-NEXT: v_add_f32_e64 v0, s3, -1.0
21 ; VI-NEXT: v_add_f32_e64 v1, s2, -1.0
22 ; VI-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, |v1|
23 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
24 ; VI-NEXT: v_add_f32_e64 v0, |v0|, |v0|
25 ; VI-NEXT: v_mul_f32_e32 v1, v0, v0
26 ; VI-NEXT: v_mad_f32 v2, -v1, v0, 1.0
27 ; VI-NEXT: v_mov_b32_e32 v0, s0
28 ; VI-NEXT: v_mov_b32_e32 v1, s1
29 ; VI-NEXT: flat_store_dword v[0:1], v2
32 ; GFX10-LABEL: multiple_fadd_use_test_f32:
34 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
35 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
36 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
37 ; GFX10-NEXT: v_add_f32_e64 v0, s3, -1.0
38 ; GFX10-NEXT: v_add_f32_e64 v1, s2, -1.0
39 ; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, |v0|, |v1|
40 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
41 ; GFX10-NEXT: v_add_f32_e64 v0, |v0|, |v0|
42 ; GFX10-NEXT: v_mul_f32_e32 v1, v0, v0
43 ; GFX10-NEXT: v_fma_f32 v0, -v1, v0, 1.0
44 ; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
45 ; GFX10-NEXT: s_endpgm
47 ; GFX11-LABEL: multiple_fadd_use_test_f32:
49 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
50 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
51 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
52 ; GFX11-NEXT: v_add_f32_e64 v0, s3, -1.0
53 ; GFX11-NEXT: v_add_f32_e64 v1, s2, -1.0
54 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
55 ; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, |v0|, |v1|
56 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
57 ; GFX11-NEXT: v_add_f32_e64 v0, |v0|, |v0|
58 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
59 ; GFX11-NEXT: v_mul_f32_e32 v1, v0, v0
60 ; GFX11-NEXT: v_fma_f32 v0, -v1, v0, 1.0
61 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
63 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
64 ; GFX11-NEXT: s_endpgm
65 %a11 = fadd float %y, -1.0
66 %a12 = call float @llvm.fabs.f32(float %a11)
67 %a13 = fadd float %x, -1.0
68 %a14 = call float @llvm.fabs.f32(float %a13)
; Select the larger of |y - 1.0| and |x - 1.0| (ordered greater-than compare).
69 %a15 = fcmp ogt float %a12, %a14
70 %a16 = select i1 %a15, float %a12, float %a14
; The doubled value below is the multi-use node this test is about.
71 %a17 = fmul float %a16, 2.0
72 %a18 = fmul float %a17, %a17
73 %a19 = fmul float %a18, %a17
74 %a20 = fsub float 1.0, %a19
75 store float %a20, ptr addrspace(1) %out
; %mul2 (x * 2.0, fast) has two users: a volatile store of the product itself and
; a fast fadd with %y whose result is stored (volatile) to the next float slot.
; The checks show the stored product emitted as v_add_f32 of x with itself, while
; the add is still fused with the inline constant 2.0 (v_mac_f32 on tonga,
; v_fma_f32 on gfx1010 and gfx1100). The [8 x i32] argument sits between %x and
; %y in the kernel argument list; the checks load them from offsets 0x8 and 0x2c.
79 define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, float %x, [8 x i32], float %y) #0 {
80 ; VI-LABEL: multiple_use_fadd_fmac_f32:
82 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
83 ; VI-NEXT: s_load_dword s6, s[4:5], 0x8
84 ; VI-NEXT: s_load_dword s3, s[4:5], 0x2c
85 ; VI-NEXT: s_waitcnt lgkmcnt(0)
86 ; VI-NEXT: v_mov_b32_e32 v0, s0
87 ; VI-NEXT: s_add_u32 s2, s0, 4
88 ; VI-NEXT: v_add_f32_e64 v2, s6, s6
89 ; VI-NEXT: v_mov_b32_e32 v1, s1
90 ; VI-NEXT: v_mov_b32_e32 v3, s3
91 ; VI-NEXT: s_addc_u32 s3, s1, 0
92 ; VI-NEXT: flat_store_dword v[0:1], v2
93 ; VI-NEXT: s_waitcnt vmcnt(0)
94 ; VI-NEXT: v_mov_b32_e32 v0, s2
95 ; VI-NEXT: v_mac_f32_e64 v3, s6, 2.0
96 ; VI-NEXT: v_mov_b32_e32 v1, s3
97 ; VI-NEXT: flat_store_dword v[0:1], v3
98 ; VI-NEXT: s_waitcnt vmcnt(0)
101 ; GFX10-LABEL: multiple_use_fadd_fmac_f32:
103 ; GFX10-NEXT: s_clause 0x2
104 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8
105 ; GFX10-NEXT: s_load_dword s3, s[4:5], 0x2c
106 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
107 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
108 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
109 ; GFX10-NEXT: v_add_f32_e64 v1, s2, s2
110 ; GFX10-NEXT: v_fma_f32 v2, s2, 2.0, s3
111 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
112 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
113 ; GFX10-NEXT: global_store_dword v0, v2, s[0:1] offset:4
114 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
115 ; GFX10-NEXT: s_endpgm
117 ; GFX11-LABEL: multiple_use_fadd_fmac_f32:
119 ; GFX11-NEXT: s_clause 0x2
120 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
121 ; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c
122 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
123 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
124 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
125 ; GFX11-NEXT: v_add_f32_e64 v1, s2, s2
126 ; GFX11-NEXT: v_fma_f32 v2, s2, 2.0, s3
127 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
128 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
129 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc
130 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
131 ; GFX11-NEXT: s_nop 0
132 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
133 ; GFX11-NEXT: s_endpgm
134 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
135 %mul2 = fmul fast float %x, 2.0
136 %mad = fadd fast float %mul2, %y
; Both values are stored volatile, so each store appears in the checked output.
137 store volatile float %mul2, ptr addrspace(1) %out
138 store volatile float %mad, ptr addrspace(1) %out.gep.1
; Same shape as the fmac test above, but the multiplied value is fabs(%x) and
; %y directly follows %x in the argument list. The checks show the absolute
; value carried as a |s2| source modifier on both users: the stored product is a
; v_add_f32 of |x| with itself, and the add is a v_mad_f32 (tonga) or v_fma_f32
; (gfx1010, gfx1100) of |x|, 2.0 and y.
142 define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, float %x, float %y) #0 {
143 ; VI-LABEL: multiple_use_fadd_fmad_f32:
145 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
146 ; VI-NEXT: s_waitcnt lgkmcnt(0)
147 ; VI-NEXT: v_mov_b32_e32 v0, s0
148 ; VI-NEXT: s_add_u32 s4, s0, 4
149 ; VI-NEXT: v_add_f32_e64 v2, |s2|, |s2|
150 ; VI-NEXT: v_mov_b32_e32 v1, s1
151 ; VI-NEXT: v_mov_b32_e32 v3, s3
152 ; VI-NEXT: s_addc_u32 s5, s1, 0
153 ; VI-NEXT: flat_store_dword v[0:1], v2
154 ; VI-NEXT: s_waitcnt vmcnt(0)
155 ; VI-NEXT: v_mov_b32_e32 v0, s4
156 ; VI-NEXT: v_mad_f32 v3, |s2|, 2.0, v3
157 ; VI-NEXT: v_mov_b32_e32 v1, s5
158 ; VI-NEXT: flat_store_dword v[0:1], v3
159 ; VI-NEXT: s_waitcnt vmcnt(0)
162 ; GFX10-LABEL: multiple_use_fadd_fmad_f32:
164 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
165 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
166 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
167 ; GFX10-NEXT: v_add_f32_e64 v1, |s2|, |s2|
168 ; GFX10-NEXT: v_fma_f32 v2, |s2|, 2.0, s3
169 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
170 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
171 ; GFX10-NEXT: global_store_dword v0, v2, s[0:1] offset:4
172 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
173 ; GFX10-NEXT: s_endpgm
175 ; GFX11-LABEL: multiple_use_fadd_fmad_f32:
177 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
178 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
179 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
180 ; GFX11-NEXT: v_add_f32_e64 v1, |s2|, |s2|
181 ; GFX11-NEXT: v_fma_f32 v2, |s2|, 2.0, s3
182 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
183 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
184 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc
185 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
186 ; GFX11-NEXT: s_nop 0
187 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
188 ; GFX11-NEXT: s_endpgm
189 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
190 %x.abs = call float @llvm.fabs.f32(float %x)
191 %mul2 = fmul fast float %x.abs, 2.0
192 %mad = fadd fast float %mul2, %y
; %mul2 is stored directly and also consumed by %mad, giving it two uses.
193 store volatile float %mul2, ptr addrspace(1) %out
194 store volatile float %mad, ptr addrspace(1) %out.gep.1
; %mul2 (fabs(%x) * 2.0, fast) is consumed only by two fast fadds (with %y and
; with %z); the product itself is never stored. The checks contain no separate
; doubling instruction: each add becomes its own v_mad_f32 (tonga) or v_fma_f32
; (gfx1010, gfx1100) taking |x| and the inline constant 2.0.
198 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %out, float %x, float %y, float %z) #0 {
199 ; VI-LABEL: multiple_use_fadd_multi_fmad_f32:
201 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
202 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
203 ; VI-NEXT: s_waitcnt lgkmcnt(0)
204 ; VI-NEXT: s_add_u32 s4, s6, 4
205 ; VI-NEXT: v_mov_b32_e32 v0, s1
206 ; VI-NEXT: v_mov_b32_e32 v1, s2
207 ; VI-NEXT: v_mad_f32 v2, |s0|, 2.0, v0
208 ; VI-NEXT: v_mad_f32 v3, |s0|, 2.0, v1
209 ; VI-NEXT: v_mov_b32_e32 v0, s6
210 ; VI-NEXT: v_mov_b32_e32 v1, s7
211 ; VI-NEXT: s_addc_u32 s5, s7, 0
212 ; VI-NEXT: flat_store_dword v[0:1], v2
213 ; VI-NEXT: s_waitcnt vmcnt(0)
214 ; VI-NEXT: v_mov_b32_e32 v0, s4
215 ; VI-NEXT: v_mov_b32_e32 v1, s5
216 ; VI-NEXT: flat_store_dword v[0:1], v3
217 ; VI-NEXT: s_waitcnt vmcnt(0)
220 ; GFX10-LABEL: multiple_use_fadd_multi_fmad_f32:
222 ; GFX10-NEXT: s_clause 0x1
223 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
224 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
225 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
226 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
227 ; GFX10-NEXT: v_fma_f32 v1, |s0|, 2.0, s1
228 ; GFX10-NEXT: v_fma_f32 v2, |s0|, 2.0, s2
229 ; GFX10-NEXT: global_store_dword v0, v1, s[6:7]
230 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
231 ; GFX10-NEXT: global_store_dword v0, v2, s[6:7] offset:4
232 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
233 ; GFX10-NEXT: s_endpgm
235 ; GFX11-LABEL: multiple_use_fadd_multi_fmad_f32:
237 ; GFX11-NEXT: s_clause 0x1
238 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8
239 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
240 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
241 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
242 ; GFX11-NEXT: v_fma_f32 v1, |s4|, 2.0, s5
243 ; GFX11-NEXT: v_fma_f32 v2, |s4|, 2.0, s6
244 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
245 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
246 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc
247 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
248 ; GFX11-NEXT: s_nop 0
249 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
250 ; GFX11-NEXT: s_endpgm
251 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
252 %x.abs = call float @llvm.fabs.f32(float %x)
253 %mul2 = fmul fast float %x.abs, 2.0
; Two independent adds share %mul2; only the sums are stored.
254 %mad0 = fadd fast float %mul2, %y
255 %mad1 = fadd fast float %mul2, %z
256 store volatile float %mad0, ptr addrspace(1) %out
257 store volatile float %mad1, ptr addrspace(1) %out.gep.1
; Multiplies (x * 2.0) by (x * -2.0), all with fast flags, and stores the result.
; The checks show the two constants combined: one v_mul_f32 of x by the inline
; constant -4.0, followed by a second v_mul_f32 by x. %y and %out.gep.1 are not
; used by the stored value.
261 define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, float %y) #0 {
262 ; VI-LABEL: fmul_x2_xn2_f32:
264 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
265 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
266 ; VI-NEXT: s_waitcnt lgkmcnt(0)
267 ; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0
268 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0
269 ; VI-NEXT: v_mov_b32_e32 v0, s0
270 ; VI-NEXT: v_mov_b32_e32 v1, s1
271 ; VI-NEXT: flat_store_dword v[0:1], v2
272 ; VI-NEXT: s_waitcnt vmcnt(0)
275 ; GFX10-LABEL: fmul_x2_xn2_f32:
277 ; GFX10-NEXT: s_clause 0x1
278 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8
279 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
280 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
281 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
282 ; GFX10-NEXT: v_mul_f32_e64 v0, s2, -4.0
283 ; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
284 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
285 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
286 ; GFX10-NEXT: s_endpgm
288 ; GFX11-LABEL: fmul_x2_xn2_f32:
290 ; GFX11-NEXT: s_clause 0x1
291 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
292 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
293 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
294 ; GFX11-NEXT: v_mul_f32_e64 v0, s2, -4.0
295 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
296 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
297 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc
298 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
299 ; GFX11-NEXT: s_nop 0
300 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
301 ; GFX11-NEXT: s_endpgm
302 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
303 %mul2 = fmul fast float %x, 2.0
304 %muln2 = fmul fast float %x, -2.0
305 %mul = fmul fast float %mul2, %muln2
306 store volatile float %mul, ptr addrspace(1) %out
; Variant of fmul_x2_xn2_f32 whose second factor is x * -3.0 (the value is still
; named %muln2). The combined constant appears in the checks as 0xc0c00000, the
; f32 bit pattern of -6.0: tonga first moves it into a VGPR with v_mov_b32,
; while gfx1010 and gfx1100 use it as a literal operand of v_mul_f32_e64.
; %y and %out.gep.1 are not used by the stored value.
310 define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, float %y) #0 {
311 ; VI-LABEL: fmul_x2_xn3_f32:
313 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
314 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
315 ; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000
316 ; VI-NEXT: s_waitcnt lgkmcnt(0)
317 ; VI-NEXT: v_mul_f32_e32 v0, s2, v0
318 ; VI-NEXT: v_mul_f32_e32 v2, s2, v0
319 ; VI-NEXT: v_mov_b32_e32 v0, s0
320 ; VI-NEXT: v_mov_b32_e32 v1, s1
321 ; VI-NEXT: flat_store_dword v[0:1], v2
322 ; VI-NEXT: s_waitcnt vmcnt(0)
325 ; GFX10-LABEL: fmul_x2_xn3_f32:
327 ; GFX10-NEXT: s_clause 0x1
328 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8
329 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
330 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
331 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
332 ; GFX10-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2
333 ; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0
334 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
335 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
336 ; GFX10-NEXT: s_endpgm
338 ; GFX11-LABEL: fmul_x2_xn3_f32:
340 ; GFX11-NEXT: s_clause 0x1
341 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
342 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
343 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
344 ; GFX11-NEXT: v_mul_f32_e64 v0, 0xc0c00000, s2
345 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
346 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0
347 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] dlc
348 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
349 ; GFX11-NEXT: s_nop 0
350 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
351 ; GFX11-NEXT: s_endpgm
352 %out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
353 %mul2 = fmul fast float %x, 2.0
354 %muln2 = fmul fast float %x, -3.0
355 %mul = fmul fast float %mul2, %muln2
356 store volatile float %mul, ptr addrspace(1) %out
; f16 counterpart of multiple_fadd_use_test_f32. The half operands arrive as
; zero-extended i16 kernel arguments and are bitcast to half; %z is bitcast but
; not used afterwards. In every checked configuration the doubling of the
; selected value stays a single v_add_f16 of |v0| with itself. The final
; 1.0 - cube differs by configuration: v_fma_f16 under the DENORM prefixes,
; v_mad_f16 under the tonga FLUSH prefix, and a separate v_mul_f16 plus
; v_sub_f16 under the gfx1010 and gfx1100 FLUSH prefixes.
360 define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
361 ; VI-DENORM-LABEL: multiple_fadd_use_test_f16:
362 ; VI-DENORM: ; %bb.0:
363 ; VI-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8
364 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
365 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
366 ; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16
367 ; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0
368 ; VI-DENORM-NEXT: v_add_f16_e64 v1, s3, -1.0
369 ; VI-DENORM-NEXT: v_cmp_gt_f16_e64 vcc, |v1|, |v0|
370 ; VI-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
371 ; VI-DENORM-NEXT: v_add_f16_e64 v0, |v0|, |v0|
372 ; VI-DENORM-NEXT: v_mul_f16_e32 v1, v0, v0
373 ; VI-DENORM-NEXT: v_fma_f16 v2, -v1, v0, 1.0
374 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
375 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
376 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
377 ; VI-DENORM-NEXT: s_endpgm
379 ; VI-FLUSH-LABEL: multiple_fadd_use_test_f16:
381 ; VI-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8
382 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
383 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
384 ; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16
385 ; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0
386 ; VI-FLUSH-NEXT: v_add_f16_e64 v1, s3, -1.0
387 ; VI-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc, |v1|, |v0|
388 ; VI-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
389 ; VI-FLUSH-NEXT: v_add_f16_e64 v0, |v0|, |v0|
390 ; VI-FLUSH-NEXT: v_mul_f16_e32 v1, v0, v0
391 ; VI-FLUSH-NEXT: v_mad_f16 v2, -v1, v0, 1.0
392 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
393 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
394 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
395 ; VI-FLUSH-NEXT: s_endpgm
397 ; GFX10-DENORM-LABEL: multiple_fadd_use_test_f16:
398 ; GFX10-DENORM: ; %bb.0:
399 ; GFX10-DENORM-NEXT: s_load_dword s0, s[4:5], 0x8
400 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v2, 0
401 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX10-DENORM-NEXT: s_lshr_b32 s1, s0, 16
403 ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, s0, -1.0
404 ; GFX10-DENORM-NEXT: v_add_f16_e64 v1, s1, -1.0
405 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
406 ; GFX10-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0|
407 ; GFX10-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
408 ; GFX10-DENORM-NEXT: v_add_f16_e64 v0, |v0|, |v0|
409 ; GFX10-DENORM-NEXT: v_mul_f16_e32 v1, v0, v0
410 ; GFX10-DENORM-NEXT: v_fma_f16 v0, -v1, v0, 1.0
411 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
412 ; GFX10-DENORM-NEXT: global_store_short v2, v0, s[0:1]
413 ; GFX10-DENORM-NEXT: s_endpgm
415 ; GFX10-FLUSH-LABEL: multiple_fadd_use_test_f16:
416 ; GFX10-FLUSH: ; %bb.0:
417 ; GFX10-FLUSH-NEXT: s_load_dword s0, s[4:5], 0x8
418 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
419 ; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16
420 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s0, -1.0
421 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v1, s1, -1.0
422 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
423 ; GFX10-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0|
424 ; GFX10-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
425 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |v0|, |v0|
426 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v0, v0
427 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, v1, v0
428 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0
429 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v0, 1.0, v0
430 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
431 ; GFX10-FLUSH-NEXT: global_store_short v1, v0, s[0:1]
432 ; GFX10-FLUSH-NEXT: s_endpgm
434 ; GFX11-DENORM-LABEL: multiple_fadd_use_test_f16:
435 ; GFX11-DENORM: ; %bb.0:
436 ; GFX11-DENORM-NEXT: s_clause 0x1
437 ; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8
438 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
439 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v2, 0
440 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
441 ; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16
442 ; GFX11-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0
443 ; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s3, -1.0
444 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
445 ; GFX11-DENORM-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0|
446 ; GFX11-DENORM-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
447 ; GFX11-DENORM-NEXT: v_add_f16_e64 v0, |v0|, |v0|
448 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
449 ; GFX11-DENORM-NEXT: v_mul_f16_e32 v1, v0, v0
450 ; GFX11-DENORM-NEXT: v_fma_f16 v0, -v1, v0, 1.0
451 ; GFX11-DENORM-NEXT: global_store_b16 v2, v0, s[0:1]
452 ; GFX11-DENORM-NEXT: s_nop 0
453 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
454 ; GFX11-DENORM-NEXT: s_endpgm
456 ; GFX11-FLUSH-LABEL: multiple_fadd_use_test_f16:
457 ; GFX11-FLUSH: ; %bb.0:
458 ; GFX11-FLUSH-NEXT: s_clause 0x1
459 ; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8
460 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
461 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
462 ; GFX11-FLUSH-NEXT: s_lshr_b32 s3, s2, 16
463 ; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0
464 ; GFX11-FLUSH-NEXT: v_add_f16_e64 v1, s3, -1.0
465 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
466 ; GFX11-FLUSH-NEXT: v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0|
467 ; GFX11-FLUSH-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
468 ; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |v0|, |v0|
469 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
470 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v0, v0
471 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v1, v0
472 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0
473 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2)
474 ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v0, 1.0, v0
475 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1]
476 ; GFX11-FLUSH-NEXT: s_nop 0
477 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
478 ; GFX11-FLUSH-NEXT: s_endpgm
; Reinterpret the i16 kernel arguments as half values.
479 %x = bitcast i16 %x.arg to half
480 %y = bitcast i16 %y.arg to half
481 %z = bitcast i16 %z.arg to half
482 %a11 = fadd half %y, -1.0
483 %a12 = call half @llvm.fabs.f16(half %a11)
484 %a13 = fadd half %x, -1.0
485 %a14 = call half @llvm.fabs.f16(half %a13)
486 %a15 = fcmp ogt half %a12, %a14
487 %a16 = select i1 %a15, half %a12, half %a14
; The doubled value below has three uses (twice in %a18, once in %a19).
488 %a17 = fmul half %a16, 2.0
489 %a18 = fmul half %a17, %a17
490 %a19 = fmul half %a18, %a17
491 %a20 = fsub half 1.0, %a19
492 store half %a20, ptr addrspace(1) %out
; f16 counterpart of multiple_use_fadd_fmac_f32; operands arrive as zero-extended
; i16 arguments and are bitcast to half. %mul2 (x * 2.0, fast) is stored volatile
; and also added to %y, with the sum stored to the next half slot (offset 2).
; Every configuration emits the stored product as v_add_f16 of x with itself.
; The sum varies: v_fma_f16 with inline 2.0 under the DENORM prefixes,
; v_mac_f16 with inline 2.0 under the tonga FLUSH prefix, and a plain v_add_f16
; of y with the already doubled value under the gfx1010 and gfx1100 FLUSH
; prefixes.
496 define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
497 ; VI-DENORM-LABEL: multiple_use_fadd_fmac_f16:
498 ; VI-DENORM: ; %bb.0:
499 ; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8
500 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
501 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
502 ; VI-DENORM-NEXT: s_lshr_b32 s3, s6, 16
503 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
504 ; VI-DENORM-NEXT: v_fma_f16 v3, s6, 2.0, v0
505 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
506 ; VI-DENORM-NEXT: v_add_f16_e64 v2, s6, s6
507 ; VI-DENORM-NEXT: s_add_u32 s2, s0, 2
508 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
509 ; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0
510 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
511 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
512 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
513 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
514 ; VI-DENORM-NEXT: flat_store_short v[0:1], v3
515 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
516 ; VI-DENORM-NEXT: s_endpgm
518 ; VI-FLUSH-LABEL: multiple_use_fadd_fmac_f16:
520 ; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8
521 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
522 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
523 ; VI-FLUSH-NEXT: s_lshr_b32 s3, s6, 16
524 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
525 ; VI-FLUSH-NEXT: v_add_f16_e64 v2, s6, s6
526 ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2
527 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
528 ; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s3
529 ; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0
530 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
531 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
532 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
533 ; VI-FLUSH-NEXT: v_mac_f16_e64 v3, s6, 2.0
534 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
535 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
536 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
537 ; VI-FLUSH-NEXT: s_endpgm
539 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmac_f16:
540 ; GFX10-DENORM: ; %bb.0:
541 ; GFX10-DENORM-NEXT: s_clause 0x1
542 ; GFX10-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8
543 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
544 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
545 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
546 ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16
547 ; GFX10-DENORM-NEXT: v_add_f16_e64 v1, s2, s2
548 ; GFX10-DENORM-NEXT: v_fma_f16 v2, s2, 2.0, s3
549 ; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1]
550 ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
551 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1] offset:2
552 ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
553 ; GFX10-DENORM-NEXT: s_endpgm
555 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmac_f16:
556 ; GFX10-FLUSH: ; %bb.0:
557 ; GFX10-FLUSH-NEXT: s_clause 0x1
558 ; GFX10-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8
559 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
560 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0
561 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
562 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2
563 ; GFX10-FLUSH-NEXT: s_lshr_b32 s2, s2, 16
564 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0
565 ; GFX10-FLUSH-NEXT: global_store_short v1, v0, s[0:1]
566 ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
567 ; GFX10-FLUSH-NEXT: global_store_short v1, v2, s[0:1] offset:2
568 ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
569 ; GFX10-FLUSH-NEXT: s_endpgm
571 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmac_f16:
572 ; GFX11-DENORM: ; %bb.0:
573 ; GFX11-DENORM-NEXT: s_clause 0x1
574 ; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8
575 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
576 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
577 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
578 ; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16
579 ; GFX11-DENORM-NEXT: v_add_f16_e64 v1, s2, s2
580 ; GFX11-DENORM-NEXT: v_fma_f16 v2, s2, 2.0, s3
581 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc
582 ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
583 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc
584 ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
585 ; GFX11-DENORM-NEXT: s_nop 0
586 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
587 ; GFX11-DENORM-NEXT: s_endpgm
589 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmac_f16:
590 ; GFX11-FLUSH: ; %bb.0:
591 ; GFX11-FLUSH-NEXT: s_clause 0x1
592 ; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8
593 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
594 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0
595 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
596 ; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, s2, s2
597 ; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16
598 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
599 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0
600 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc
601 ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
602 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[0:1] offset:2 dlc
603 ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
604 ; GFX11-FLUSH-NEXT: s_nop 0
605 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
606 ; GFX11-FLUSH-NEXT: s_endpgm
; Reinterpret the i16 kernel arguments as half values.
607 %x = bitcast i16 %x.arg to half
608 %y = bitcast i16 %y.arg to half
609 %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
610 %mul2 = fmul fast half %x, 2.0
611 %mad = fadd fast half %mul2, %y
; %mul2 is stored directly and also consumed by %mad, giving it two uses.
612 store volatile half %mul2, ptr addrspace(1) %out
613 store volatile half %mad, ptr addrspace(1) %out.gep.1
; f16 counterpart of multiple_use_fadd_fmad_f32: like the f16 fmac test, but the
; multiplied value is fabs(%x), which the checks carry as a |s| source modifier.
; The stored product is always a v_add_f16 of |x| with itself. The sum is a
; v_fma_f16 with inline 2.0 under the DENORM prefixes, a v_mad_f16 with inline
; 2.0 under the tonga FLUSH prefix, and a plain v_add_f16 of y with the already
; doubled value under the gfx1010 and gfx1100 FLUSH prefixes.
617 define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
618 ; VI-DENORM-LABEL: multiple_use_fadd_fmad_f16:
619 ; VI-DENORM: ; %bb.0:
620 ; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8
621 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
622 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
623 ; VI-DENORM-NEXT: s_lshr_b32 s3, s6, 16
624 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
625 ; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0
626 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
627 ; VI-DENORM-NEXT: v_add_f16_e64 v2, |s6|, |s6|
628 ; VI-DENORM-NEXT: s_add_u32 s2, s0, 2
629 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
630 ; VI-DENORM-NEXT: s_addc_u32 s3, s1, 0
631 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
632 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
633 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
634 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
635 ; VI-DENORM-NEXT: flat_store_short v[0:1], v3
636 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
637 ; VI-DENORM-NEXT: s_endpgm
639 ; VI-FLUSH-LABEL: multiple_use_fadd_fmad_f16:
641 ; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8
642 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
643 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
644 ; VI-FLUSH-NEXT: s_lshr_b32 s3, s6, 16
645 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3
646 ; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0
647 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
648 ; VI-FLUSH-NEXT: v_add_f16_e64 v2, |s6|, |s6|
649 ; VI-FLUSH-NEXT: s_add_u32 s2, s0, 2
650 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
651 ; VI-FLUSH-NEXT: s_addc_u32 s3, s1, 0
652 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
653 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
654 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
655 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
656 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
657 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
658 ; VI-FLUSH-NEXT: s_endpgm
660 ; GFX10-DENORM-LABEL: multiple_use_fadd_fmad_f16:
661 ; GFX10-DENORM: ; %bb.0:
662 ; GFX10-DENORM-NEXT: s_clause 0x1
663 ; GFX10-DENORM-NEXT: s_load_dword s2, s[4:5], 0x8
664 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
665 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
666 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
667 ; GFX10-DENORM-NEXT: s_lshr_b32 s3, s2, 16
668 ; GFX10-DENORM-NEXT: v_add_f16_e64 v1, |s2|, |s2|
669 ; GFX10-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, s3
670 ; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1]
671 ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
672 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1] offset:2
673 ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
674 ; GFX10-DENORM-NEXT: s_endpgm
676 ; GFX10-FLUSH-LABEL: multiple_use_fadd_fmad_f16:
677 ; GFX10-FLUSH: ; %bb.0:
678 ; GFX10-FLUSH-NEXT: s_clause 0x1
679 ; GFX10-FLUSH-NEXT: s_load_dword s2, s[4:5], 0x8
680 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
681 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0
682 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
683 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2|
684 ; GFX10-FLUSH-NEXT: s_lshr_b32 s2, s2, 16
685 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0
686 ; GFX10-FLUSH-NEXT: global_store_short v1, v0, s[0:1]
687 ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
688 ; GFX10-FLUSH-NEXT: global_store_short v1, v2, s[0:1] offset:2
689 ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
690 ; GFX10-FLUSH-NEXT: s_endpgm
692 ; GFX11-DENORM-LABEL: multiple_use_fadd_fmad_f16:
693 ; GFX11-DENORM: ; %bb.0:
694 ; GFX11-DENORM-NEXT: s_clause 0x1
695 ; GFX11-DENORM-NEXT: s_load_b32 s2, s[0:1], 0x8
696 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
697 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
698 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
699 ; GFX11-DENORM-NEXT: s_lshr_b32 s3, s2, 16
700 ; GFX11-DENORM-NEXT: v_add_f16_e64 v1, |s2|, |s2|
701 ; GFX11-DENORM-NEXT: v_fma_f16 v2, |s2|, 2.0, s3
702 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc
703 ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
704 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc
705 ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
706 ; GFX11-DENORM-NEXT: s_nop 0
707 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
708 ; GFX11-DENORM-NEXT: s_endpgm
710 ; GFX11-FLUSH-LABEL: multiple_use_fadd_fmad_f16:
711 ; GFX11-FLUSH: ; %bb.0:
712 ; GFX11-FLUSH-NEXT: s_clause 0x1
713 ; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x8
714 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
715 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0
716 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
717 ; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s2|, |s2|
718 ; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16
719 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
720 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0
721 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] dlc
722 ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
723 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[0:1] offset:2 dlc
724 ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
725 ; GFX11-FLUSH-NEXT: s_nop 0
726 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
727 ; GFX11-FLUSH-NEXT: s_endpgm
; Reinterpret the i16 kernel arguments as half values.
728 %x = bitcast i16 %x.arg to half
729 %y = bitcast i16 %y.arg to half
730 %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
731 %x.abs = call half @llvm.fabs.f16(half %x)
732 %mul2 = fmul fast half %x.abs, 2.0
733 %mad = fadd fast half %mul2, %y
; %mul2 is stored directly and also consumed by %mad, giving it two uses.
734 store volatile half %mul2, ptr addrspace(1) %out
735 store volatile half %mad, ptr addrspace(1) %out.gep.1
; Kernel under test: %mul2 = |x| * 2.0 has two users, the fast fadds with %y
; and with %z, and both sums are written with volatile stores.
; What the checks below pin, per run configuration:
;   - DENORM prefixes (tonga, gfx1010, gfx1100): two v_fma_f16, each taking
;     |x| and the inline constant 2.0 directly.
;   - FLUSH prefixes on tonga: the same shape, using v_mad_f16.
;   - FLUSH prefixes on gfx1010 / gfx1100: |x| + |x| is computed once with
;     v_add_f16 and that result is reused by two further v_add_f16.
739 define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
740 ; VI-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16:
741 ; VI-DENORM: ; %bb.0:
742 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
743 ; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
744 ; VI-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8
745 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
746 ; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16
747 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
748 ; VI-DENORM-NEXT: v_fma_f16 v2, |s6|, 2.0, v0
749 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s1
750 ; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0
751 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
752 ; VI-DENORM-NEXT: s_add_u32 s4, s2, 2
753 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
754 ; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0
755 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
756 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
757 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s4
758 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5
759 ; VI-DENORM-NEXT: flat_store_short v[0:1], v3
760 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
761 ; VI-DENORM-NEXT: s_endpgm
763 ; VI-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16:
765 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
766 ; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
767 ; VI-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8
768 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
769 ; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
770 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
771 ; VI-FLUSH-NEXT: v_mad_f16 v2, |s6|, 2.0, v0
772 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s1
773 ; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0
774 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
775 ; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2
776 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
777 ; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0
778 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
779 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
780 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
781 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
782 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
783 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
784 ; VI-FLUSH-NEXT: s_endpgm
786 ; GFX10-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16:
787 ; GFX10-DENORM: ; %bb.0:
788 ; GFX10-DENORM-NEXT: s_clause 0x2
789 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
790 ; GFX10-DENORM-NEXT: s_load_dword s6, s[4:5], 0x8
791 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
792 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
793 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
794 ; GFX10-DENORM-NEXT: s_lshr_b32 s0, s0, 16
795 ; GFX10-DENORM-NEXT: v_fma_f16 v2, |s6|, 2.0, s1
796 ; GFX10-DENORM-NEXT: v_fma_f16 v1, |s6|, 2.0, s0
797 ; GFX10-DENORM-NEXT: global_store_short v0, v1, s[2:3]
798 ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
799 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3] offset:2
800 ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
801 ; GFX10-DENORM-NEXT: s_endpgm
803 ; GFX10-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16:
804 ; GFX10-FLUSH: ; %bb.0:
805 ; GFX10-FLUSH-NEXT: s_clause 0x2
806 ; GFX10-FLUSH-NEXT: s_load_dword s6, s[4:5], 0x8
807 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
808 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
809 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v1, 0
810 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
811 ; GFX10-FLUSH-NEXT: v_add_f16_e64 v0, |s6|, |s6|
812 ; GFX10-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
813 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v2, s0, v0
814 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, s1, v0
815 ; GFX10-FLUSH-NEXT: global_store_short v1, v2, s[2:3]
816 ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
817 ; GFX10-FLUSH-NEXT: global_store_short v1, v0, s[2:3] offset:2
818 ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
819 ; GFX10-FLUSH-NEXT: s_endpgm
821 ; GFX11-DENORM-LABEL: multiple_use_fadd_multi_fmad_f16:
822 ; GFX11-DENORM: ; %bb.0:
823 ; GFX11-DENORM-NEXT: s_clause 0x2
824 ; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
825 ; GFX11-DENORM-NEXT: s_load_b32 s4, s[0:1], 0x8
826 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
827 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
828 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
829 ; GFX11-DENORM-NEXT: s_lshr_b32 s2, s2, 16
830 ; GFX11-DENORM-NEXT: v_fma_f16 v2, |s4|, 2.0, s3
831 ; GFX11-DENORM-NEXT: v_fma_f16 v1, |s4|, 2.0, s2
832 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1] dlc
833 ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
834 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 dlc
835 ; GFX11-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
836 ; GFX11-DENORM-NEXT: s_nop 0
837 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
838 ; GFX11-DENORM-NEXT: s_endpgm
840 ; GFX11-FLUSH-LABEL: multiple_use_fadd_multi_fmad_f16:
841 ; GFX11-FLUSH: ; %bb.0:
842 ; GFX11-FLUSH-NEXT: s_clause 0x2
843 ; GFX11-FLUSH-NEXT: s_load_b32 s4, s[0:1], 0x8
844 ; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
845 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
846 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v1, 0
847 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
848 ; GFX11-FLUSH-NEXT: v_add_f16_e64 v0, |s4|, |s4|
849 ; GFX11-FLUSH-NEXT: s_lshr_b32 s2, s2, 16
850 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
851 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, s2, v0
852 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, s3, v0
853 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v2, s[0:1] dlc
854 ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
855 ; GFX11-FLUSH-NEXT: global_store_b16 v1, v0, s[0:1] offset:2 dlc
856 ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
857 ; GFX11-FLUSH-NEXT: s_nop 0
858 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
859 ; GFX11-FLUSH-NEXT: s_endpgm
; IR body: the i16 kernel arguments are reinterpreted as half via bitcast.
; %mul2 is shared by %mad0 and %mad1; the stores are volatile and go to
; %out and to the half element right after it (%out.gep.1).
860 %x = bitcast i16 %x.arg to half
861 %y = bitcast i16 %y.arg to half
862 %z = bitcast i16 %z.arg to half
863 %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
864 %x.abs = call half @llvm.fabs.f16(half %x)
865 %mul2 = fmul fast half %x.abs, 2.0
866 %mad0 = fadd fast half %mul2, %y
867 %mad1 = fadd fast half %mul2, %z
868 store volatile half %mad0, ptr addrspace(1) %out
869 store volatile half %mad1, ptr addrspace(1) %out.gep.1
; Kernel under test: (x * 2.0) * (x * -2.0) with fast-math flags on every fmul.
; The checks below expect exactly two v_mul_f16 on all three targets: the
; first multiplies x by the inline constant -4.0 (2.0 and -2.0 folded
; together), the second multiplies that result by x again.
873 define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
874 ; VI-LABEL: fmul_x2_xn2_f16:
876 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
877 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
878 ; VI-NEXT: s_waitcnt lgkmcnt(0)
879 ; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0
880 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0
881 ; VI-NEXT: v_mov_b32_e32 v0, s0
882 ; VI-NEXT: v_mov_b32_e32 v1, s1
883 ; VI-NEXT: flat_store_short v[0:1], v2
884 ; VI-NEXT: s_waitcnt vmcnt(0)
887 ; GFX10-LABEL: fmul_x2_xn2_f16:
889 ; GFX10-NEXT: s_clause 0x1
890 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8
891 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
892 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
893 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
894 ; GFX10-NEXT: v_mul_f16_e64 v0, s2, -4.0
895 ; GFX10-NEXT: v_mul_f16_e32 v0, s2, v0
896 ; GFX10-NEXT: global_store_short v1, v0, s[0:1]
897 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
898 ; GFX10-NEXT: s_endpgm
900 ; GFX11-LABEL: fmul_x2_xn2_f16:
902 ; GFX11-NEXT: s_clause 0x1
903 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
904 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
905 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
906 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
907 ; GFX11-NEXT: v_mul_f16_e64 v0, s2, -4.0
908 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
909 ; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0
910 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc
911 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
912 ; GFX11-NEXT: s_nop 0
913 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
914 ; GFX11-NEXT: s_endpgm
; IR body: only %x feeds the stored value. %y and %out.gep.1 are defined
; but not used by any instruction visible in this function.
915 %x = bitcast i16 %x.arg to half
916 %y = bitcast i16 %y.arg to half
917 %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
918 %mul2 = fmul fast half %x, 2.0
919 %muln2 = fmul fast half %x, -2.0
920 %mul = fmul fast half %mul2, %muln2
921 store volatile half %mul, ptr addrspace(1) %out
; Kernel under test: (x * 2.0) * (x * -3.0) with fast-math flags on every fmul.
; The checks below expect two v_mul_f16 using the constant 0xc600, which is
; the IEEE half-precision encoding of -6.0 (2.0 and -3.0 folded together).
; On tonga the constant is first moved into a VGPR with v_mov_b32; on gfx1010
; and gfx1100 it appears directly as a literal operand of v_mul_f16_e64.
925 define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
926 ; VI-LABEL: fmul_x2_xn3_f16:
928 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
929 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
930 ; VI-NEXT: v_mov_b32_e32 v0, 0xc600
931 ; VI-NEXT: s_waitcnt lgkmcnt(0)
932 ; VI-NEXT: v_mul_f16_e32 v0, s2, v0
933 ; VI-NEXT: v_mul_f16_e32 v2, s2, v0
934 ; VI-NEXT: v_mov_b32_e32 v0, s0
935 ; VI-NEXT: v_mov_b32_e32 v1, s1
936 ; VI-NEXT: flat_store_short v[0:1], v2
937 ; VI-NEXT: s_waitcnt vmcnt(0)
940 ; GFX10-LABEL: fmul_x2_xn3_f16:
942 ; GFX10-NEXT: s_clause 0x1
943 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8
944 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
945 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
946 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
947 ; GFX10-NEXT: v_mul_f16_e64 v0, 0xc600, s2
948 ; GFX10-NEXT: v_mul_f16_e32 v0, s2, v0
949 ; GFX10-NEXT: global_store_short v1, v0, s[0:1]
950 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
951 ; GFX10-NEXT: s_endpgm
953 ; GFX11-LABEL: fmul_x2_xn3_f16:
955 ; GFX11-NEXT: s_clause 0x1
956 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
957 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
958 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
959 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
960 ; GFX11-NEXT: v_mul_f16_e64 v0, 0xc600, s2
961 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
962 ; GFX11-NEXT: v_mul_f16_e32 v0, s2, v0
963 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] dlc
964 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
965 ; GFX11-NEXT: s_nop 0
966 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
967 ; GFX11-NEXT: s_endpgm
; IR body: only %x feeds the stored value. %y and %out.gep.1 are defined
; but not used by any instruction visible in this function.
; NOTE(review): despite its name, %muln2 holds x * -3.0 in this function.
968 %x = bitcast i16 %x.arg to half
969 %y = bitcast i16 %y.arg to half
970 %out.gep.1 = getelementptr half, ptr addrspace(1) %out, i32 1
971 %mul2 = fmul fast half %x, 2.0
972 %muln2 = fmul fast half %x, -3.0
973 %mul = fmul fast half %mul2, %muln2
974 store volatile half %mul, ptr addrspace(1) %out
; Attribute groups referenced above: #0 is attached to the kernel definitions
; in this file, #1 to the llvm.fabs intrinsic declarations.
978 attributes #0 = { nounwind "unsafe-fp-math"="true" }
979 attributes #1 = { nounwind readnone }