1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s
3 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
4 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s
5 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s
; fold (fadd (fma x, y, (fpext (fmul u, v))), z) -> (fma x, y, (fma (fpext u), (fpext v), z))
;
; Scalar case with the fmuladd on the left-hand side of the fadd: the half
; product u*v is extended to float and used as the addend of
; llvm.fmuladd.f32(x, y, .); z is then added to that result.
; Per the checks below, GFX9-DENORM expects no standalone v_mul_f16 or
; v_add_f32 (v_mad_mix_f32 followed by v_mac_f32), while the three GFX10 runs
; expect v_mul_f16 + v_fma_mix_f32 + v_add_f32.
; The CHECK lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_vs float @test_f16_f32_add_fma_ext_mul(float %x, float %y, float %z, half %u, half %v) {
; GFX9-DENORM-LABEL: test_f16_f32_add_fma_ext_mul:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, v4, v2 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1
; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_f16_f32_add_fma_ext_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1]
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_f16_f32_add_fma_ext_mul:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1]
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_f32_add_fma_ext_mul:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[0,0,1]
; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
    %a = fmul half %u, %v
    %b = fpext half %a to float
    %c = call float @llvm.fmuladd.f32(float %x, float %y, float %b)
    %d = fadd float %c, %z
    ret float %d
}
; fold (fadd (fpext (fma x, y, (fmul u, v))), z) -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
;
; Scalar case with the extended fmuladd on the left-hand side of the fadd:
; llvm.fmuladd.f16(x, y, u*v) is computed in half, extended to float, and z
; is added to the extended value.
; Per the checks below, GFX9-DENORM expects two chained v_mad_mix_f32
; instructions; GFX10 and GFX10-CONTRACT expect v_mul_f16 + v_fmac_f16 +
; v_cvt_f32_f16 + v_add_f32; GFX10-DENORM expects two v_mul_f16, a v_add_f16,
; the conversion and a v_add_f32.
; The CHECK lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_vs float @test_f16_f32_add_ext_fma_mul(half %x, half %y, float %z, half %u, half %v) {
; GFX9-DENORM-LABEL: test_f16_f32_add_ext_fma_mul:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, v4, v2 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_f16_f32_add_ext_fma_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-NEXT: v_fmac_f16_e32 v3, v0, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_f16_f32_add_ext_fma_mul:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v3, v0, v1
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_f32_add_ext_fma_mul:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v3
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
    %a = fmul half %u, %v
    %b = call half @llvm.fmuladd.f16(half %x, half %y, half %a)
    %c = fpext half %b to float
    %d = fadd float %c, %z
    ret float %d
}
; fold (fadd x, (fma y, z, (fpext (fmul u, v))) -> (fma y, z, (fma (fpext u), (fpext v), x))
;
; Scalar case with the fmuladd on the right-hand side of the fadd: the half
; product u*v is extended to float and used as the addend of
; llvm.fmuladd.f32(y, z, .); the result is the second operand of the fadd
; with x.
; Per the checks below, GFX9-DENORM expects v_mad_mix_f32 followed by
; v_mac_f32, while the three GFX10 runs expect v_mul_f16 + v_fma_mix_f32 +
; v_add_f32.
; The CHECK lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_vs float @test_f16_f32_add_fma_ext_mul_rhs(float %x, float %y, float %z, half %u, half %v) {
; GFX9-DENORM-LABEL: test_f16_f32_add_fma_ext_mul_rhs:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mac_f32_e32 v0, v1, v2
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_f16_f32_add_fma_ext_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-NEXT: v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1]
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_f16_f32_add_fma_ext_mul_rhs:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1]
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_f32_add_fma_ext_mul_rhs:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v1, v2, v3 op_sel_hi:[0,0,1]
; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
    %a = fmul half %u, %v
    %b = fpext half %a to float
    %c = call float @llvm.fmuladd.f32(float %y, float %z, float %b)
    %d = fadd float %x, %c
    ret float %d
}
; fold (fadd x, (fpext (fma y, z, (fmul u, v))) -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
;
; Scalar case with the extended fmuladd on the right-hand side of the fadd:
; llvm.fmuladd.f16(y, z, u*v) is computed in half, extended to float, and
; used as the second operand of the fadd with x.
; Per the checks below, GFX9-DENORM expects two chained v_mad_mix_f32
; instructions; GFX10 and GFX10-CONTRACT expect v_mul_f16 + v_fmac_f16 +
; v_cvt_f32_f16 + v_add_f32; GFX10-DENORM expects two v_mul_f16, a v_add_f16,
; the conversion and a v_add_f32.
; The CHECK lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_vs float @test_f16_f32_add_ext_fma_mul_rhs(float %x, half %y, half %z, half %u, half %v) {
; GFX9-DENORM-LABEL: test_f16_f32_add_ext_fma_mul_rhs:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v3, v4, v0 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v1, v2, v0 op_sel_hi:[1,1,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_f16_f32_add_ext_fma_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-NEXT: v_fmac_f16_e32 v3, v1, v2
; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v3
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_f16_f32_add_ext_fma_mul_rhs:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v1, v3
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_f32_add_ext_fma_mul_rhs:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-DENORM-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-DENORM-NEXT: v_add_f16_e32 v1, v1, v3
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
    %a = fmul half %u, %v
    %b = call half @llvm.fmuladd.f16(half %y, half %z, half %a)
    %c = fpext half %b to float
    %d = fadd float %x, %c
    ret float %d
}
; fold (fadd (fma x, y, (fpext (fmul u, v))), z) -> (fma x, y, (fma (fpext u), (fpext v), z))
;
; <4 x half> / <4 x float> variant with the fmuladd on the left-hand side of
; the fadd: the half vector product u*v is extended to <4 x float> and used
; as the addend of llvm.fmuladd.v4f32(x, y, .); z is then added.
; Per the checks below, all four runs expect two packed v_pk_mul_f16, four
; mixed-precision multiply-adds (v_mad_mix_f32 on GFX9-DENORM, v_fma_mix_f32
; on the GFX10 runs) and four separate v_add_f32.
; The CHECK lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_fma_ext_mul(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x half> %u, <4 x half> %v) {
; GFX9-DENORM-LABEL: test_v4f16_v4f32_add_fma_ext_mul:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14
; GFX9-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v8
; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v9
; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v10
; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v3, v11
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_v4f16_v4f32_add_fma_ext_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_pk_mul_f16 v12, v12, v14
; GFX10-NEXT: v_pk_mul_f16 v13, v13, v15
; GFX10-NEXT: v_fma_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1]
; GFX10-NEXT: v_fma_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX10-NEXT: v_fma_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1]
; GFX10-NEXT: v_fma_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX10-NEXT: v_add_f32_e32 v0, v0, v8
; GFX10-NEXT: v_add_f32_e32 v1, v1, v9
; GFX10-NEXT: v_add_f32_e32 v2, v2, v10
; GFX10-NEXT: v_add_f32_e32 v3, v3, v11
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_fma_ext_mul:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v12, v12, v14
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v13, v13, v15
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v8
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v9
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v10
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v11
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_fma_ext_mul:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14
; GFX10-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, v4, v12 op_sel_hi:[0,0,1]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v1, v5, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v2, v6, v13 op_sel_hi:[0,0,1]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v3, v7, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v8
; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v9
; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v10
; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v11
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
    %a = fmul <4 x half> %u, %v
    %b = fpext <4 x half> %a to <4 x float>
    %c = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %b)
    %d = fadd <4 x float> %c, %z
    ret <4 x float> %d
}
; fold (fadd (fpext (fma x, y, (fmul u, v))), z) -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
;
; <4 x half> / <4 x float> variant with the extended fmuladd on the left-hand
; side of the fadd: llvm.fmuladd.v4f16(x, y, u*v) is computed in half,
; extended to <4 x float>, and z is added to the extended value.
; Per the checks below, GFX10 and GFX10-CONTRACT expect v_pk_mul_f16 +
; v_pk_fma_f16, while GFX9-DENORM and GFX10-DENORM expect v_pk_mul_f16 +
; v_pk_add_f16; every run then converts each lane with v_cvt_f32_f16 and
; finishes with four v_add_f32.
; The CHECK lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul(<4 x half> %x, <4 x half> %y, <4 x float> %z, <4 x half> %u, <4 x half> %v) {
; GFX9-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX9-DENORM-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v0, v8
; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v9
; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v1
; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v2, v4
; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v3, v5
; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v8, v6
; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v9, v7
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX10-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v8
; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v9
; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_f16_e32 v8, v1
; GFX10-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_add_f32_e32 v0, v2, v4
; GFX10-NEXT: v_add_f32_e32 v1, v3, v5
; GFX10-NEXT: v_add_f32_e32 v2, v8, v6
; GFX10-NEXT: v_add_f32_e32 v3, v9, v7
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v8
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v9
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v8, v1
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v2, v4
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v3, v5
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v8, v6
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v9, v7
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v9, v11
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v8
; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v2
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v1
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v2, v4
; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v3, v5
; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v8, v6
; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v9, v7
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
    %a = fmul <4 x half> %u, %v
    %b = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %a)
    %c = fpext <4 x half> %b to <4 x float>
    %d = fadd <4 x float> %c, %z
    ret <4 x float> %d
}
; fold (fadd x, (fma y, z, (fpext (fmul u, v))) -> (fma y, z, (fma (fpext u), (fpext v), x))
;
; <4 x half> / <4 x float> variant with the fmuladd on the right-hand side of
; the fadd: the half vector product u*v is extended to <4 x float> and used
; as the addend of llvm.fmuladd.v4f32(y, z, .); the result is the second
; operand of the fadd with x.
; Per the checks below, all four runs expect two packed v_pk_mul_f16, four
; mixed-precision multiply-adds (v_mad_mix_f32 on GFX9-DENORM, v_fma_mix_f32
; on the GFX10 runs) and four separate v_add_f32.
; The CHECK lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_fma_ext_mul_rhs(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x half> %u, <4 x half> %v) {
; GFX9-DENORM-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14
; GFX9-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15
; GFX9-DENORM-NEXT: v_mad_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v5
; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v6
; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_pk_mul_f16 v12, v12, v14
; GFX10-NEXT: v_pk_mul_f16 v13, v13, v15
; GFX10-NEXT: v_fma_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1]
; GFX10-NEXT: v_fma_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX10-NEXT: v_fma_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1]
; GFX10-NEXT: v_fma_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v12, v12, v14
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v13, v13, v15
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v4
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v5
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v6
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_fma_ext_mul_rhs:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: v_pk_mul_f16 v12, v12, v14
; GFX10-DENORM-NEXT: v_pk_mul_f16 v13, v13, v15
; GFX10-DENORM-NEXT: v_fma_mix_f32 v4, v4, v8, v12 op_sel_hi:[0,0,1]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v5, v5, v9, v12 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v6, v6, v10, v13 op_sel_hi:[0,0,1]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v7, v7, v11, v13 op_sel:[0,0,1] op_sel_hi:[0,0,1]
; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v4
; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v5
; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v6
; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
    %a = fmul <4 x half> %u, %v
    %b = fpext <4 x half> %a to <4 x float>
    %c = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %y, <4 x float> %z, <4 x float> %b)
    %d = fadd <4 x float> %x, %c
    ret <4 x float> %d
}
; fold (fadd x, (fpext (fma y, z, (fmul u, v))) -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
;
; <4 x half> / <4 x float> variant with the extended fmuladd on the
; right-hand side of the fadd: llvm.fmuladd.v4f16(y, z, u*v) is computed in
; half, extended to <4 x float>, and used as the second operand of the fadd
; with x.
; Per the checks below, GFX10 and GFX10-CONTRACT expect v_pk_mul_f16 +
; v_pk_fma_f16, while GFX9-DENORM and GFX10-DENORM expect v_pk_mul_f16 +
; v_pk_add_f16; every run then converts each lane with v_cvt_f32_f16 and
; finishes with four v_add_f32.
; The CHECK lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %x, <4 x half> %y, <4 x half> %z, <4 x half> %u, <4 x half> %v) {
; GFX9-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX9-DENORM-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX9-DENORM-NEXT: v_pk_mul_f16 v4, v4, v6
; GFX9-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7
; GFX9-DENORM-NEXT: v_pk_add_f16 v4, v4, v8
; GFX9-DENORM-NEXT: v_pk_add_f16 v5, v5, v9
; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v6, v4
; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v6
; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v4
; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v7
; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX10-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX10-NEXT: v_pk_fma_f16 v4, v4, v6, v8
; GFX10-NEXT: v_pk_fma_f16 v5, v5, v7, v9
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v4
; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX10-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_add_f32_e32 v0, v0, v6
; GFX10-NEXT: v_add_f32_e32 v1, v1, v4
; GFX10-NEXT: v_add_f32_e32 v2, v2, v7
; GFX10-NEXT: v_add_f32_e32 v3, v3, v5
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v4, v4, v6, v8
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v5, v5, v7, v9
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v6, v4
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v6
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v4
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v7
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v5
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX10-DENORM-NEXT: v_pk_mul_f16 v4, v4, v6
; GFX10-DENORM-NEXT: v_pk_mul_f16 v6, v9, v11
; GFX10-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7
; GFX10-DENORM-NEXT: v_pk_add_f16 v4, v4, v8
; GFX10-DENORM-NEXT: v_pk_add_f16 v5, v5, v6
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v6, v4
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v6
; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v4
; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v7
; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v5
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
    %a = fmul <4 x half> %u, %v
    %b = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %y, <4 x half> %z, <4 x half> %a)
    %c = fpext <4 x half> %b to <4 x float>
    %d = fadd <4 x float> %x, %c
    ret <4 x float> %d
}
; Declarations of the llvm.fmuladd overloads called by the tests in this
; file, grouped by element type (scalar first, then the 4-wide vector form).

; Half-precision overloads.
declare half @llvm.fmuladd.f16(half, half, half) #0
declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>) #0

; Single-precision overloads.
declare float @llvm.fmuladd.f32(float, float, float) #0
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #0

; Attribute group shared by every declaration above.
attributes #0 = { nounwind readnone }