1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
3 ; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
5 ; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
6 ; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s
8 ; FIXME: This should also fold when fma is actually fast if an FMA
9 ; exists in the original program.
11 ; (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))
; Single-use case: %mul.u.v is used only by the fmuladd, and the fmuladd
; result is used only by the final `fadd fast` with %z.
; Expected lowering per check prefix:
;   GCN-FLUSH   - two chained v_mac_f32 accumulating into v2
;   GCN-FASTFMA - two chained v_fma_f32
;   GCN-SLOWFMA - two v_mul_f32 followed by two v_add_f32
12 define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
13 ; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul:
15 ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000
16 ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1
17 ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
18 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
19 ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
20 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
21 ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
22 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
23 ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
24 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
25 ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
26 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
27 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v3, v4
28 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
29 ; GCN-FLUSH-NEXT: buffer_store_dword v2, off, s[0:3], 0
30 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
31 ; GCN-FLUSH-NEXT: s_endpgm
33 ; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul:
34 ; GCN-FASTFMA: ; %bb.0:
35 ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
36 ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1
37 ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
38 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
39 ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
40 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
41 ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
42 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
43 ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
44 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
45 ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
46 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
47 ; GCN-FASTFMA-NEXT: v_fma_f32 v2, v3, v4, v2
48 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2
49 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
50 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
51 ; GCN-FASTFMA-NEXT: s_endpgm
53 ; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul:
54 ; GCN-SLOWFMA: ; %bb.0:
55 ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
56 ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1
57 ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
58 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
59 ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
60 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
61 ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
62 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
63 ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
64 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
65 ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
66 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
67 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4
68 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
69 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3
70 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2
71 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
72 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
73 ; GCN-SLOWFMA-NEXT: s_endpgm
74 %x = load volatile float, ptr addrspace(1) undef
75 %y = load volatile float, ptr addrspace(1) undef
76 %z = load volatile float, ptr addrspace(1) undef
77 %u = load volatile float, ptr addrspace(1) undef
78 %v = load volatile float, ptr addrspace(1) undef
79 %mul.u.v = fmul fast float %u, %v
80 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
81 %add = fadd fast float %fma, %z
82 store volatile float %add, ptr addrspace(1) undef
; Subtraction variant of the single-use case: the fmuladd result (whose
; addend is %mul.u.v) is the LHS of `fsub fast ..., %z`. Note the fsub
; result is still named %add in the IR below.
; Expected lowering per check prefix:
;   GCN-FLUSH   - v_mad_f32 with negated addend (-v2), then v_mac_f32
;   GCN-FASTFMA - v_fma_f32 with negated addend (-v2), then v_fma_f32
;   GCN-SLOWFMA - two v_mul_f32, one v_add_f32, one v_sub_f32
86 define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
87 ; GCN-FLUSH-LABEL: fast_sub_fmuladd_fmul:
89 ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000
90 ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1
91 ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
92 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
93 ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
94 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
95 ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
96 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
97 ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
98 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
99 ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
100 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
101 ; GCN-FLUSH-NEXT: v_mad_f32 v2, v3, v4, -v2
102 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
103 ; GCN-FLUSH-NEXT: buffer_store_dword v2, off, s[0:3], 0
104 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
105 ; GCN-FLUSH-NEXT: s_endpgm
107 ; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fmul:
108 ; GCN-FASTFMA: ; %bb.0:
109 ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
110 ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1
111 ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
112 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
113 ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
114 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
115 ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
116 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
117 ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
118 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
119 ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
120 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
121 ; GCN-FASTFMA-NEXT: v_fma_f32 v2, v3, v4, -v2
122 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2
123 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
124 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
125 ; GCN-FASTFMA-NEXT: s_endpgm
127 ; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fmul:
128 ; GCN-SLOWFMA: ; %bb.0:
129 ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
130 ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1
131 ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
132 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
133 ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
134 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
135 ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
136 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
137 ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
138 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
139 ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
140 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
141 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4
142 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
143 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3
144 ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v0, v0, v2
145 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
146 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
147 ; GCN-SLOWFMA-NEXT: s_endpgm
148 %x = load volatile float, ptr addrspace(1) undef
149 %y = load volatile float, ptr addrspace(1) undef
150 %z = load volatile float, ptr addrspace(1) undef
151 %u = load volatile float, ptr addrspace(1) undef
152 %v = load volatile float, ptr addrspace(1) undef
153 %mul.u.v = fmul fast float %u, %v
154 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
155 %add = fsub fast float %fma, %z
156 store volatile float %add, ptr addrspace(1) undef
; Multi-use multiply: %mul.u.v is stored (volatile) in addition to feeding
; the fmuladd, so it has two uses.
; In every configuration the checks expect the v_mul_f32 result to be stored
; and the last operation to be a separate v_add_f32 with the %z value (v2):
;   GCN-FLUSH   - v_mul_f32, v_mac_f32, v_add_f32
;   GCN-FASTFMA - v_mul_f32, v_fma_f32, v_add_f32
;   GCN-SLOWFMA - two v_mul_f32, two v_add_f32
160 define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
161 ; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul_multi_use_mul:
162 ; GCN-FLUSH: ; %bb.0:
163 ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000
164 ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1
165 ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
166 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
167 ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
168 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
169 ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
170 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
171 ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
172 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
173 ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
174 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
175 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
176 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
177 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0)
178 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1
179 ; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v3, v2
180 ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
181 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
182 ; GCN-FLUSH-NEXT: s_endpgm
184 ; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul_multi_use_mul:
185 ; GCN-FASTFMA: ; %bb.0:
186 ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
187 ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1
188 ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
189 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
190 ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
191 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
192 ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
193 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
194 ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
195 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
196 ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
197 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
198 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
199 ; GCN-FASTFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0
200 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
201 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
202 ; GCN-FASTFMA-NEXT: v_add_f32_e32 v0, v0, v2
203 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
204 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
205 ; GCN-FASTFMA-NEXT: s_endpgm
207 ; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul_multi_use_mul:
208 ; GCN-SLOWFMA: ; %bb.0:
209 ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
210 ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1
211 ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
212 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
213 ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
214 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
215 ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
216 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
217 ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
218 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
219 ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
220 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
221 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4
222 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
223 ; GCN-SLOWFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0
224 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
225 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3
226 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2
227 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
228 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
229 ; GCN-SLOWFMA-NEXT: s_endpgm
230 %x = load volatile float, ptr addrspace(1) undef
231 %y = load volatile float, ptr addrspace(1) undef
232 %z = load volatile float, ptr addrspace(1) undef
233 %u = load volatile float, ptr addrspace(1) undef
234 %v = load volatile float, ptr addrspace(1) undef
235 %mul.u.v = fmul fast float %u, %v
236 store volatile float %mul.u.v, ptr addrspace(1) undef
237 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
238 %add = fadd fast float %fma, %z
239 store volatile float %add, ptr addrspace(1) undef
; Commuted form of the multi-use-multiply case: %mul.u.v is stored as well as
; used by the fmuladd, and the final fadd takes %z as its first operand and
; the fmuladd result as its second.
; The expected instruction sequence has the same shape as the non-commuted
; test; the final v_add_f32 lists the %z register (v2) first.
243 define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
244 ; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul_multi_use_mul_commute:
245 ; GCN-FLUSH: ; %bb.0:
246 ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000
247 ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1
248 ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
249 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
250 ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
251 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
252 ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
253 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
254 ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
255 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
256 ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
257 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
258 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
259 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
260 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0)
261 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1
262 ; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v2, v3
263 ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
264 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
265 ; GCN-FLUSH-NEXT: s_endpgm
267 ; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul_multi_use_mul_commute:
268 ; GCN-FASTFMA: ; %bb.0:
269 ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
270 ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1
271 ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
272 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
273 ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
274 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
275 ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
276 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
277 ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
278 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
279 ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
280 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
281 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
282 ; GCN-FASTFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0
283 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
284 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
285 ; GCN-FASTFMA-NEXT: v_add_f32_e32 v0, v2, v0
286 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
287 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
288 ; GCN-FASTFMA-NEXT: s_endpgm
290 ; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul_multi_use_mul_commute:
291 ; GCN-SLOWFMA: ; %bb.0:
292 ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
293 ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1
294 ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
295 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
296 ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
297 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
298 ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
299 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
300 ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
301 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
302 ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
303 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
304 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4
305 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
306 ; GCN-SLOWFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0
307 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
308 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3
309 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v2, v0
310 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
311 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
312 ; GCN-SLOWFMA-NEXT: s_endpgm
313 %x = load volatile float, ptr addrspace(1) undef
314 %y = load volatile float, ptr addrspace(1) undef
315 %z = load volatile float, ptr addrspace(1) undef
316 %u = load volatile float, ptr addrspace(1) undef
317 %v = load volatile float, ptr addrspace(1) undef
318 %mul.u.v = fmul fast float %u, %v
319 store volatile float %mul.u.v, ptr addrspace(1) undef
320 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
321 %add = fadd fast float %z, %fma
322 store volatile float %add, ptr addrspace(1) undef
; Multi-use fmuladd: the fmuladd result %fma is stored (volatile) and also
; used as the first operand of the final `fadd fast` with %z.
; In every configuration the checks expect the fmuladd value to be computed
; and stored first, followed by a separate v_add_f32 with v2:
;   GCN-FLUSH   - v_mul_f32, v_mac_f32, store, v_add_f32
;   GCN-FASTFMA - v_mul_f32, v_fma_f32, store, v_add_f32
;   GCN-SLOWFMA - two v_mul_f32, v_add_f32, store, v_add_f32
326 define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
327 ; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd:
328 ; GCN-FLUSH: ; %bb.0:
329 ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000
330 ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1
331 ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
332 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
333 ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
334 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
335 ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
336 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
337 ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
338 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
339 ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
340 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
341 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
342 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1
343 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
344 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
345 ; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v3, v2
346 ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
347 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
348 ; GCN-FLUSH-NEXT: s_endpgm
350 ; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd:
351 ; GCN-FASTFMA: ; %bb.0:
352 ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
353 ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1
354 ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
355 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
356 ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
357 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
358 ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
359 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
360 ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
361 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
362 ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
363 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
364 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
365 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
366 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
367 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0)
368 ; GCN-FASTFMA-NEXT: v_add_f32_e32 v0, v0, v2
369 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
370 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
371 ; GCN-FASTFMA-NEXT: s_endpgm
373 ; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd:
374 ; GCN-SLOWFMA: ; %bb.0:
375 ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
376 ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1
377 ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
378 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
379 ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
380 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
381 ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
382 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
383 ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
384 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
385 ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
386 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
387 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4
388 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
389 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3
390 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
391 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0)
392 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2
393 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
394 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
395 ; GCN-SLOWFMA-NEXT: s_endpgm
396 %x = load volatile float, ptr addrspace(1) undef
397 %y = load volatile float, ptr addrspace(1) undef
398 %z = load volatile float, ptr addrspace(1) undef
399 %u = load volatile float, ptr addrspace(1) undef
400 %v = load volatile float, ptr addrspace(1) undef
401 %mul.u.v = fmul fast float %u, %v
402 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
403 store volatile float %fma, ptr addrspace(1) undef
404 %add = fadd fast float %fma, %z
405 store volatile float %add, ptr addrspace(1) undef
; Commuted form of the multi-use-fmuladd case: %fma is stored as well as
; used by the final fadd, which takes %z as its first operand and %fma as its
; second.
; The expected instruction sequence has the same shape as the non-commuted
; test; the final v_add_f32 lists the %z register (v2) first.
409 define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
410 ; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
411 ; GCN-FLUSH: ; %bb.0:
412 ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000
413 ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1
414 ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
415 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
416 ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
417 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
418 ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
419 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
420 ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
421 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
422 ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
423 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
424 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
425 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1
426 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
427 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
428 ; GCN-FLUSH-NEXT: v_add_f32_e32 v0, v2, v3
429 ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
430 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
431 ; GCN-FLUSH-NEXT: s_endpgm
433 ; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
434 ; GCN-FASTFMA: ; %bb.0:
435 ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
436 ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1
437 ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
438 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
439 ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
440 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
441 ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
442 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
443 ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
444 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
445 ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
446 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
447 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
448 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
449 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
450 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0)
451 ; GCN-FASTFMA-NEXT: v_add_f32_e32 v0, v2, v0
452 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
453 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
454 ; GCN-FASTFMA-NEXT: s_endpgm
456 ; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
457 ; GCN-SLOWFMA: ; %bb.0:
458 ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
459 ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1
460 ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
461 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
462 ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
463 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
464 ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
465 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
466 ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
467 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
468 ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
469 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
470 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4
471 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
472 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3
473 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
474 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0)
475 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v2, v0
476 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
477 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
478 ; GCN-SLOWFMA-NEXT: s_endpgm
479 %x = load volatile float, ptr addrspace(1) undef
480 %y = load volatile float, ptr addrspace(1) undef
481 %z = load volatile float, ptr addrspace(1) undef
482 %u = load volatile float, ptr addrspace(1) undef
483 %v = load volatile float, ptr addrspace(1) undef
484 %mul.u.v = fmul fast float %u, %v
485 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
486 store volatile float %fma, ptr addrspace(1) undef
487 %add = fadd fast float %z, %fma
488 store volatile float %add, ptr addrspace(1) undef
; Subtraction with a multi-use multiply: %mul.u.v feeds the fmuladd and is
; also stored (volatile); both stores come after the fsub in the IR.
; In every configuration the checks expect the v_mul_f32 result (v3) to stay
; live for its store and the subtraction to be a separate v_sub_f32:
;   GCN-FLUSH   - v_mul_f32, v_mad_f32 (writes v0, leaving v3 intact), v_sub_f32
;   GCN-FASTFMA - v_mul_f32, v_fma_f32, v_sub_f32
;   GCN-SLOWFMA - two v_mul_f32, v_add_f32, v_sub_f32
492 define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
493 ; GCN-FLUSH-LABEL: fast_sub_fmuladd_fmul_multi_use_mul:
494 ; GCN-FLUSH: ; %bb.0:
495 ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000
496 ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1
497 ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
498 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
499 ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
500 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
501 ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
502 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
503 ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
504 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
505 ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
506 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
507 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
508 ; GCN-FLUSH-NEXT: v_mad_f32 v0, v0, v1, v3
509 ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v0, v2
510 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
511 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
512 ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
513 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
514 ; GCN-FLUSH-NEXT: s_endpgm
516 ; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_mul:
517 ; GCN-FASTFMA: ; %bb.0:
518 ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
519 ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1
520 ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
521 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
522 ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
523 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
524 ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
525 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
526 ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
527 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
528 ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
529 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
530 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
531 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
532 ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v0, v0, v2
533 ; GCN-FASTFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0
534 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
535 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
536 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
537 ; GCN-FASTFMA-NEXT: s_endpgm
539 ; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_mul:
540 ; GCN-SLOWFMA: ; %bb.0:
541 ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
542 ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1
543 ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
544 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
545 ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
546 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
547 ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
548 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
549 ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
550 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
551 ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
552 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
553 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4
554 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
555 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3
556 ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v0, v0, v2
557 ; GCN-SLOWFMA-NEXT: buffer_store_dword v3, off, s[0:3], 0
558 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
559 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
560 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
561 ; GCN-SLOWFMA-NEXT: s_endpgm
562 %x = load volatile float, ptr addrspace(1) undef
563 %y = load volatile float, ptr addrspace(1) undef
564 %z = load volatile float, ptr addrspace(1) undef
565 %u = load volatile float, ptr addrspace(1) undef
566 %v = load volatile float, ptr addrspace(1) undef
567 %mul.u.v = fmul fast float %u, %v
568 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
569 %sub = fsub fast float %fma, %z
570 store volatile float %mul.u.v, ptr addrspace(1) undef
571 store volatile float %sub, ptr addrspace(1) undef
; Subtraction with a multi-use fmuladd on the left-hand side: %fma is the
; first operand of `fsub fast %fma, %z` and is also stored (volatile). Note
; the fsub result is named %add in the IR below.
; In every configuration the checks expect the fmuladd value to be kept in a
; register for its store, with the subtraction as a separate v_sub_f32:
;   GCN-FLUSH   - v_mul_f32, v_mac_f32, v_sub_f32
;   GCN-FASTFMA - v_mul_f32, v_fma_f32, v_sub_f32
;   GCN-SLOWFMA - two v_mul_f32, v_add_f32, v_sub_f32
575 define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs() #0 {
576 ; GCN-FLUSH-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs:
577 ; GCN-FLUSH: ; %bb.0:
578 ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000
579 ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1
580 ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
581 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
582 ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
583 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
584 ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
585 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
586 ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
587 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
588 ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
589 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
590 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
591 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1
592 ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v3, v2
593 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
594 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
595 ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
596 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
597 ; GCN-FLUSH-NEXT: s_endpgm
599 ; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs:
600 ; GCN-FASTFMA: ; %bb.0:
601 ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
602 ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1
603 ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
604 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
605 ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
606 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
607 ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
608 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
609 ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
610 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
611 ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
612 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
613 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
614 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
615 ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v1, v0, v2
616 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
617 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
618 ; GCN-FASTFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0
619 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
620 ; GCN-FASTFMA-NEXT: s_endpgm
622 ; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs:
623 ; GCN-SLOWFMA: ; %bb.0:
624 ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
625 ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1
626 ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
627 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
628 ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
629 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
630 ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
631 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
632 ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
633 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
634 ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
635 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
636 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4
637 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
638 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3
639 ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v1, v0, v2
640 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
641 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
642 ; GCN-SLOWFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0
643 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
644 ; GCN-SLOWFMA-NEXT: s_endpgm
645 %x = load volatile float, ptr addrspace(1) undef
646 %y = load volatile float, ptr addrspace(1) undef
647 %z = load volatile float, ptr addrspace(1) undef
648 %u = load volatile float, ptr addrspace(1) undef
649 %v = load volatile float, ptr addrspace(1) undef
650 %mul.u.v = fmul fast float %u, %v
651 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
652 %add = fsub fast float %fma, %z
653 store volatile float %fma, ptr addrspace(1) undef
654 store volatile float %add, ptr addrspace(1) undef
; fsub with a multi-use fmuladd as the right-hand operand:
;   z - fmuladd(x, y, u * v)
; The fmuladd result is stored to memory as well as being subtracted, so it
; has two uses. For each of the three check prefixes below, the expected
; output keeps the u * v multiply, the x * y multiply-accumulate and the
; subtract as separate instructions, and the final v_sub_f32 takes the
; fmuladd result as its second source operand.
; The five volatile loads supply x, y, z, u and v in that order.
; Note: %add in the body names the result of an fsub, not an fadd.
658 define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs() #0 {
659 ; GCN-FLUSH-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs:
660 ; GCN-FLUSH: ; %bb.0:
661 ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000
662 ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1
663 ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
664 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
665 ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
666 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
667 ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
668 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
669 ; GCN-FLUSH-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
670 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
671 ; GCN-FLUSH-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
672 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
673 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
674 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1
675 ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v3
676 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
677 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
678 ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
679 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
680 ; GCN-FLUSH-NEXT: s_endpgm
682 ; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs:
683 ; GCN-FASTFMA: ; %bb.0:
684 ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
685 ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1
686 ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
687 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
688 ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
689 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
690 ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
691 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
692 ; GCN-FASTFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
693 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
694 ; GCN-FASTFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
695 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
696 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
697 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
698 ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v1, v2, v0
699 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
700 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
701 ; GCN-FASTFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0
702 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
703 ; GCN-FASTFMA-NEXT: s_endpgm
705 ; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs:
706 ; GCN-SLOWFMA: ; %bb.0:
707 ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
708 ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1
709 ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
710 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
711 ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
712 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
713 ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
714 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
715 ; GCN-SLOWFMA-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
716 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
717 ; GCN-SLOWFMA-NEXT: buffer_load_dword v4, off, s[0:3], 0 glc
718 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
719 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v3, v3, v4
720 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
721 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v3
722 ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v1, v2, v0
723 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
724 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
725 ; GCN-SLOWFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0
726 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
727 ; GCN-SLOWFMA-NEXT: s_endpgm
728 %x = load volatile float, ptr addrspace(1) undef
729 %y = load volatile float, ptr addrspace(1) undef
730 %z = load volatile float, ptr addrspace(1) undef
731 %u = load volatile float, ptr addrspace(1) undef
732 %v = load volatile float, ptr addrspace(1) undef
733 %mul.u.v = fmul fast float %u, %v
734 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
735 %add = fsub fast float %z, %fma
736 store volatile float %fma, ptr addrspace(1) undef
737 store volatile float %add, ptr addrspace(1) undef
; fsub with a multi-use fmuladd as the left-hand operand, where the fmuladd
; addend is an fpext of a half-precision multiply:
;   fmuladd(x, y, fpext(u * v)) - z
; x, y and z are loaded as float; u and v are loaded as half. The fmuladd
; result is stored as well as being subtracted, so it has two uses.
; For each of the three check prefixes below, the expected output converts
; both half operands with v_cvt_f32_f16 and multiplies them in f32, keeps the
; x * y multiply-accumulate separate, and ends with a v_sub_f32 whose first
; source operand is the fmuladd result.
; Note: %add in the body names the result of an fsub, not an fadd.
741 define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() #0 {
742 ; GCN-FLUSH-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs:
743 ; GCN-FLUSH: ; %bb.0:
744 ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000
745 ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1
746 ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
747 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
748 ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
749 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
750 ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
751 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
752 ; GCN-FLUSH-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
753 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
754 ; GCN-FLUSH-NEXT: buffer_load_ushort v4, off, s[0:3], 0 glc
755 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
756 ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
757 ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v4
758 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
759 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1
760 ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v3, v2
761 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
762 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
763 ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
764 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
765 ; GCN-FLUSH-NEXT: s_endpgm
767 ; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs:
768 ; GCN-FASTFMA: ; %bb.0:
769 ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
770 ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1
771 ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
772 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
773 ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
774 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
775 ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
776 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
777 ; GCN-FASTFMA-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
778 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
779 ; GCN-FASTFMA-NEXT: buffer_load_ushort v4, off, s[0:3], 0 glc
780 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
781 ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v3, v3
782 ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v4, v4
783 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
784 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
785 ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v1, v0, v2
786 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
787 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
788 ; GCN-FASTFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0
789 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
790 ; GCN-FASTFMA-NEXT: s_endpgm
792 ; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs:
793 ; GCN-SLOWFMA: ; %bb.0:
794 ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
795 ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1
796 ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
797 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
798 ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
799 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
800 ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
801 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
802 ; GCN-SLOWFMA-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
803 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
804 ; GCN-SLOWFMA-NEXT: buffer_load_ushort v4, off, s[0:3], 0 glc
805 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
806 ; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v3, v3
807 ; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v4, v4
808 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
809 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v1, v3, v4
810 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v1
811 ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v1, v0, v2
812 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
813 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
814 ; GCN-SLOWFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0
815 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
816 ; GCN-SLOWFMA-NEXT: s_endpgm
817 %x = load volatile float, ptr addrspace(1) undef
818 %y = load volatile float, ptr addrspace(1) undef
819 %z = load volatile float, ptr addrspace(1) undef
820 %u = load volatile half, ptr addrspace(1) undef
821 %v = load volatile half, ptr addrspace(1) undef
822 %mul.u.v.half = fmul fast half %u, %v
823 %mul.u.v = fpext half %mul.u.v.half to float
824 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
825 %add = fsub fast float %fma, %z
826 store volatile float %fma, ptr addrspace(1) undef
827 store volatile float %add, ptr addrspace(1) undef
; fsub with a multi-use fmuladd as the right-hand operand, where the fmuladd
; addend is an fpext of a half-precision multiply:
;   z - fmuladd(x, y, fpext(u * v))
; x, y and z are loaded as float; u and v are loaded as half. The fmuladd
; result is stored as well as being subtracted, so it has two uses.
; For each of the three check prefixes below, the expected output converts
; both half operands with v_cvt_f32_f16 and multiplies them in f32, keeps the
; x * y multiply-accumulate separate, and ends with a v_sub_f32 whose second
; source operand is the fmuladd result.
; Note: %add in the body names the result of an fsub, not an fadd.
831 define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs() #0 {
832 ; GCN-FLUSH-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs:
833 ; GCN-FLUSH: ; %bb.0:
834 ; GCN-FLUSH-NEXT: s_mov_b32 s3, 0xf000
835 ; GCN-FLUSH-NEXT: s_mov_b32 s2, -1
836 ; GCN-FLUSH-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
837 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
838 ; GCN-FLUSH-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
839 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
840 ; GCN-FLUSH-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
841 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
842 ; GCN-FLUSH-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
843 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
844 ; GCN-FLUSH-NEXT: buffer_load_ushort v4, off, s[0:3], 0 glc
845 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
846 ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
847 ; GCN-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v4
848 ; GCN-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v4
849 ; GCN-FLUSH-NEXT: v_mac_f32_e32 v3, v0, v1
850 ; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v3
851 ; GCN-FLUSH-NEXT: buffer_store_dword v3, off, s[0:3], 0
852 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
853 ; GCN-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
854 ; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
855 ; GCN-FLUSH-NEXT: s_endpgm
857 ; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs:
858 ; GCN-FASTFMA: ; %bb.0:
859 ; GCN-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
860 ; GCN-FASTFMA-NEXT: s_mov_b32 s2, -1
861 ; GCN-FASTFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
862 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
863 ; GCN-FASTFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
864 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
865 ; GCN-FASTFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
866 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
867 ; GCN-FASTFMA-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
868 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
869 ; GCN-FASTFMA-NEXT: buffer_load_ushort v4, off, s[0:3], 0 glc
870 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
871 ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v3, v3
872 ; GCN-FASTFMA-NEXT: v_cvt_f32_f16_e32 v4, v4
873 ; GCN-FASTFMA-NEXT: v_mul_f32_e32 v3, v3, v4
874 ; GCN-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v3
875 ; GCN-FASTFMA-NEXT: v_sub_f32_e32 v1, v2, v0
876 ; GCN-FASTFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
877 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
878 ; GCN-FASTFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0
879 ; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
880 ; GCN-FASTFMA-NEXT: s_endpgm
882 ; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs:
883 ; GCN-SLOWFMA: ; %bb.0:
884 ; GCN-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
885 ; GCN-SLOWFMA-NEXT: s_mov_b32 s2, -1
886 ; GCN-SLOWFMA-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
887 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
888 ; GCN-SLOWFMA-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
889 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
890 ; GCN-SLOWFMA-NEXT: buffer_load_dword v2, off, s[0:3], 0 glc
891 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
892 ; GCN-SLOWFMA-NEXT: buffer_load_ushort v3, off, s[0:3], 0 glc
893 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
894 ; GCN-SLOWFMA-NEXT: buffer_load_ushort v4, off, s[0:3], 0 glc
895 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
896 ; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v3, v3
897 ; GCN-SLOWFMA-NEXT: v_cvt_f32_f16_e32 v4, v4
898 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
899 ; GCN-SLOWFMA-NEXT: v_mul_f32_e32 v1, v3, v4
900 ; GCN-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v1
901 ; GCN-SLOWFMA-NEXT: v_sub_f32_e32 v1, v2, v0
902 ; GCN-SLOWFMA-NEXT: buffer_store_dword v0, off, s[0:3], 0
903 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
904 ; GCN-SLOWFMA-NEXT: buffer_store_dword v1, off, s[0:3], 0
905 ; GCN-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
906 ; GCN-SLOWFMA-NEXT: s_endpgm
907 %x = load volatile float, ptr addrspace(1) undef
908 %y = load volatile float, ptr addrspace(1) undef
909 %z = load volatile float, ptr addrspace(1) undef
910 %u = load volatile half, ptr addrspace(1) undef
911 %v = load volatile half, ptr addrspace(1) undef
912 %mul.u.v.half = fmul fast half %u, %v
913 %mul.u.v = fpext half %mul.u.v.half to float
914 %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
915 %add = fsub fast float %z, %fma
916 store volatile float %fma, ptr addrspace(1) undef
917 store volatile float %add, ptr addrspace(1) undef
; Intrinsic declarations. llvm.fmuladd.f32 is called by the test kernels above.
; NOTE(review): no call to llvm.fma.f32 appears in the nearby tests; presumably
; it is used elsewhere in the file - confirm before removing.
921 declare float @llvm.fma.f32(float, float, float) #1
922 declare float @llvm.fmuladd.f32(float, float, float) #1
; Attribute group #0 is attached to the test kernels; #1 to the intrinsic
; declarations above.
924 attributes #0 = { nounwind }
925 attributes #1 = { nounwind readnone }
926 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: