1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-FLUSH %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-DENORM %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-FLUSH %s
7 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DENORM %s
8 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH %s
9 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-DENORM %s
; Intrinsics under test, in a scalar half form and a two-element half vector
; form. Each takes three operands of the same type and returns that type.
11 declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
12 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
; Scalar f16 case with three loaded operands. The kernel loads one half from
; each of %a, %b and %c, combines them with llvm.fmuladd.f16 and stores the
; result through %r.
;
; What the autogenerated checks below pin down for each check prefix
; (FLUSH runs use -denormal-fp-math=preserve-sign, DENORM runs use
; -denormal-fp-math=ieee, see the RUN lines)
;   SI prefix                - inputs converted with v_cvt_f32_f16, combined
;                              with v_mac_f32, result converted back with
;                              v_cvt_f16_f32
;   VI-FLUSH prefix          - v_mac_f16
;   VI-DENORM prefix         - v_fma_f16
;   GFX10/GFX11 FLUSH runs   - separate v_mul_f16 and v_add_f16
;   GFX10/GFX11 DENORM runs  - v_fmac_f16
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
14 define amdgpu_kernel void @fmuladd_f16(
15 ; SI-LABEL: fmuladd_f16:
17 ; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
18 ; SI-NEXT: s_mov_b32 s11, 0xf000
19 ; SI-NEXT: s_mov_b32 s10, -1
20 ; SI-NEXT: s_mov_b32 s14, s10
21 ; SI-NEXT: s_mov_b32 s15, s11
22 ; SI-NEXT: s_waitcnt lgkmcnt(0)
23 ; SI-NEXT: s_mov_b32 s12, s2
24 ; SI-NEXT: s_mov_b32 s13, s3
25 ; SI-NEXT: s_mov_b32 s16, s4
26 ; SI-NEXT: s_mov_b32 s17, s5
27 ; SI-NEXT: s_mov_b32 s18, s10
28 ; SI-NEXT: s_mov_b32 s19, s11
29 ; SI-NEXT: s_mov_b32 s4, s6
30 ; SI-NEXT: s_mov_b32 s5, s7
31 ; SI-NEXT: s_mov_b32 s6, s10
32 ; SI-NEXT: s_mov_b32 s7, s11
33 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
34 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
35 ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
36 ; SI-NEXT: s_mov_b32 s8, s0
37 ; SI-NEXT: s_mov_b32 s9, s1
38 ; SI-NEXT: s_waitcnt vmcnt(2)
39 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
40 ; SI-NEXT: s_waitcnt vmcnt(1)
41 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
42 ; SI-NEXT: s_waitcnt vmcnt(0)
43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
44 ; SI-NEXT: v_mac_f32_e32 v2, v0, v1
45 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v2
46 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
49 ; VI-FLUSH-LABEL: fmuladd_f16:
51 ; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
52 ; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
53 ; VI-FLUSH-NEXT: s_mov_b32 s10, -1
54 ; VI-FLUSH-NEXT: s_mov_b32 s14, s10
55 ; VI-FLUSH-NEXT: s_mov_b32 s15, s11
56 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
57 ; VI-FLUSH-NEXT: s_mov_b32 s12, s2
58 ; VI-FLUSH-NEXT: s_mov_b32 s13, s3
59 ; VI-FLUSH-NEXT: s_mov_b32 s16, s4
60 ; VI-FLUSH-NEXT: s_mov_b32 s17, s5
61 ; VI-FLUSH-NEXT: s_mov_b32 s18, s10
62 ; VI-FLUSH-NEXT: s_mov_b32 s19, s11
63 ; VI-FLUSH-NEXT: s_mov_b32 s4, s6
64 ; VI-FLUSH-NEXT: s_mov_b32 s5, s7
65 ; VI-FLUSH-NEXT: s_mov_b32 s6, s10
66 ; VI-FLUSH-NEXT: s_mov_b32 s7, s11
67 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0
68 ; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0
69 ; VI-FLUSH-NEXT: buffer_load_ushort v2, off, s[4:7], 0
70 ; VI-FLUSH-NEXT: s_mov_b32 s8, s0
71 ; VI-FLUSH-NEXT: s_mov_b32 s9, s1
72 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
73 ; VI-FLUSH-NEXT: v_mac_f16_e32 v2, v0, v1
74 ; VI-FLUSH-NEXT: buffer_store_short v2, off, s[8:11], 0
75 ; VI-FLUSH-NEXT: s_endpgm
77 ; VI-DENORM-LABEL: fmuladd_f16:
79 ; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
80 ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000
81 ; VI-DENORM-NEXT: s_mov_b32 s10, -1
82 ; VI-DENORM-NEXT: s_mov_b32 s14, s10
83 ; VI-DENORM-NEXT: s_mov_b32 s15, s11
84 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
85 ; VI-DENORM-NEXT: s_mov_b32 s12, s2
86 ; VI-DENORM-NEXT: s_mov_b32 s13, s3
87 ; VI-DENORM-NEXT: s_mov_b32 s16, s4
88 ; VI-DENORM-NEXT: s_mov_b32 s17, s5
89 ; VI-DENORM-NEXT: s_mov_b32 s18, s10
90 ; VI-DENORM-NEXT: s_mov_b32 s19, s11
91 ; VI-DENORM-NEXT: s_mov_b32 s4, s6
92 ; VI-DENORM-NEXT: s_mov_b32 s5, s7
93 ; VI-DENORM-NEXT: s_mov_b32 s6, s10
94 ; VI-DENORM-NEXT: s_mov_b32 s7, s11
95 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0
96 ; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0
97 ; VI-DENORM-NEXT: buffer_load_ushort v2, off, s[4:7], 0
98 ; VI-DENORM-NEXT: s_mov_b32 s8, s0
99 ; VI-DENORM-NEXT: s_mov_b32 s9, s1
100 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
101 ; VI-DENORM-NEXT: v_fma_f16 v0, v0, v1, v2
102 ; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0
103 ; VI-DENORM-NEXT: s_endpgm
105 ; GFX10-FLUSH-LABEL: fmuladd_f16:
106 ; GFX10-FLUSH: ; %bb.0:
107 ; GFX10-FLUSH-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
108 ; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1
109 ; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
110 ; GFX10-FLUSH-NEXT: s_mov_b32 s6, s2
111 ; GFX10-FLUSH-NEXT: s_mov_b32 s7, s3
112 ; GFX10-FLUSH-NEXT: s_mov_b32 s18, s2
113 ; GFX10-FLUSH-NEXT: s_mov_b32 s19, s3
114 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
115 ; GFX10-FLUSH-NEXT: s_mov_b32 s4, s10
116 ; GFX10-FLUSH-NEXT: s_mov_b32 s5, s11
117 ; GFX10-FLUSH-NEXT: s_mov_b32 s16, s12
118 ; GFX10-FLUSH-NEXT: s_mov_b32 s17, s13
119 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[4:7], 0
120 ; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0
121 ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s14
122 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s15
123 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2
124 ; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3
125 ; GFX10-FLUSH-NEXT: s_mov_b32 s0, s8
126 ; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[12:15], 0
127 ; GFX10-FLUSH-NEXT: s_mov_b32 s1, s9
128 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
129 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
130 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
131 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2
132 ; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0
133 ; GFX10-FLUSH-NEXT: s_endpgm
135 ; GFX10-DENORM-LABEL: fmuladd_f16:
136 ; GFX10-DENORM: ; %bb.0:
137 ; GFX10-DENORM-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
138 ; GFX10-DENORM-NEXT: s_mov_b32 s2, -1
139 ; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000
140 ; GFX10-DENORM-NEXT: s_mov_b32 s6, s2
141 ; GFX10-DENORM-NEXT: s_mov_b32 s7, s3
142 ; GFX10-DENORM-NEXT: s_mov_b32 s18, s2
143 ; GFX10-DENORM-NEXT: s_mov_b32 s19, s3
144 ; GFX10-DENORM-NEXT: s_mov_b32 s22, s2
145 ; GFX10-DENORM-NEXT: s_mov_b32 s23, s3
146 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
147 ; GFX10-DENORM-NEXT: s_mov_b32 s4, s10
148 ; GFX10-DENORM-NEXT: s_mov_b32 s5, s11
149 ; GFX10-DENORM-NEXT: s_mov_b32 s16, s12
150 ; GFX10-DENORM-NEXT: s_mov_b32 s17, s13
151 ; GFX10-DENORM-NEXT: s_mov_b32 s20, s14
152 ; GFX10-DENORM-NEXT: s_mov_b32 s21, s15
153 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[4:7], 0
154 ; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0
155 ; GFX10-DENORM-NEXT: buffer_load_ushort v2, off, s[20:23], 0
156 ; GFX10-DENORM-NEXT: s_mov_b32 s0, s8
157 ; GFX10-DENORM-NEXT: s_mov_b32 s1, s9
158 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
159 ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1
160 ; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[0:3], 0
161 ; GFX10-DENORM-NEXT: s_endpgm
163 ; GFX11-FLUSH-LABEL: fmuladd_f16:
164 ; GFX11-FLUSH: ; %bb.0:
165 ; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
166 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
167 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
168 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
169 ; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
170 ; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10
171 ; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11
172 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
173 ; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2
174 ; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
175 ; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4
176 ; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5
177 ; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0
178 ; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[16:19], 0
179 ; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6
180 ; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7
181 ; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10
182 ; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11
183 ; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0
184 ; GFX11-FLUSH-NEXT: buffer_load_u16 v2, off, s[4:7], 0
185 ; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1
186 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
187 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
188 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
189 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
190 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2
191 ; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
192 ; GFX11-FLUSH-NEXT: s_endpgm
194 ; GFX11-DENORM-LABEL: fmuladd_f16:
195 ; GFX11-DENORM: ; %bb.0:
196 ; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
197 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
198 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
199 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
200 ; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
201 ; GFX11-DENORM-NEXT: s_mov_b32 s18, s10
202 ; GFX11-DENORM-NEXT: s_mov_b32 s19, s11
203 ; GFX11-DENORM-NEXT: s_mov_b32 s22, s10
204 ; GFX11-DENORM-NEXT: s_mov_b32 s23, s11
205 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
206 ; GFX11-DENORM-NEXT: s_mov_b32 s12, s2
207 ; GFX11-DENORM-NEXT: s_mov_b32 s13, s3
208 ; GFX11-DENORM-NEXT: s_mov_b32 s16, s4
209 ; GFX11-DENORM-NEXT: s_mov_b32 s17, s5
210 ; GFX11-DENORM-NEXT: s_mov_b32 s20, s6
211 ; GFX11-DENORM-NEXT: s_mov_b32 s21, s7
212 ; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0
213 ; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[16:19], 0
214 ; GFX11-DENORM-NEXT: buffer_load_u16 v2, off, s[20:23], 0
215 ; GFX11-DENORM-NEXT: s_mov_b32 s8, s0
216 ; GFX11-DENORM-NEXT: s_mov_b32 s9, s1
217 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
218 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1
219 ; GFX11-DENORM-NEXT: buffer_store_b16 v2, off, s[8:11], 0
220 ; GFX11-DENORM-NEXT: s_endpgm
224 ptr addrspace(1) %c) {
; Plain (non-volatile) loads of the two multiplicands and the addend.
225 %a.val = load half, ptr addrspace(1) %a
226 %b.val = load half, ptr addrspace(1) %b
227 %c.val = load half, ptr addrspace(1) %c
; fmuladd computes a * b + c; the target chooses between a fused and an
; unfused form, which is what the per-prefix checks above distinguish.
228 %r.val = call half @llvm.fmuladd.f16(half %a.val, half %b.val, half %c.val)
229 store half %r.val, ptr addrspace(1) %r
; Scalar f16 case where the FIRST multiplicand is the constant 3.0. The other
; multiplicand and the addend are loaded from %b and %c, and the result is
; stored through %r.
;
; What the autogenerated checks below pin down for each check prefix
;   SI prefix            - v_madmk_f32 with the literal 0x40400000 after
;                          widening the loaded values to f32
;   VI-FLUSH prefix      - v_madmk_f16 with the literal 0x4200
;   VI-DENORM prefix     - constant moved to an SGPR with s_movk_i32, then
;                          v_fma_f16
;   GFX10/GFX11 FLUSH    - v_mul_f16 by the literal 0x4200, then v_add_f16
;   GFX10-DENORM prefix  - v_fmamk_f16 with the literal 0x4200
;   GFX11-DENORM prefix  - v_fmac_f16 with the literal 0x4200 as a source
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
233 define amdgpu_kernel void @fmuladd_f16_imm_a(
234 ; SI-LABEL: fmuladd_f16_imm_a:
236 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
237 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
238 ; SI-NEXT: s_mov_b32 s7, 0xf000
239 ; SI-NEXT: s_mov_b32 s6, -1
240 ; SI-NEXT: s_mov_b32 s14, s6
241 ; SI-NEXT: s_waitcnt lgkmcnt(0)
242 ; SI-NEXT: s_mov_b32 s12, s2
243 ; SI-NEXT: s_mov_b32 s13, s3
244 ; SI-NEXT: s_mov_b32 s15, s7
245 ; SI-NEXT: s_mov_b32 s10, s6
246 ; SI-NEXT: s_mov_b32 s11, s7
247 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
248 ; SI-NEXT: s_waitcnt vmcnt(0)
249 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
250 ; SI-NEXT: s_waitcnt vmcnt(0)
251 ; SI-NEXT: s_mov_b32 s4, s0
252 ; SI-NEXT: s_mov_b32 s5, s1
253 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
254 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
255 ; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1
256 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
257 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
260 ; VI-FLUSH-LABEL: fmuladd_f16_imm_a:
262 ; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
263 ; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
264 ; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000
265 ; VI-FLUSH-NEXT: s_mov_b32 s6, -1
266 ; VI-FLUSH-NEXT: s_mov_b32 s14, s6
267 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
268 ; VI-FLUSH-NEXT: s_mov_b32 s12, s2
269 ; VI-FLUSH-NEXT: s_mov_b32 s13, s3
270 ; VI-FLUSH-NEXT: s_mov_b32 s15, s7
271 ; VI-FLUSH-NEXT: s_mov_b32 s10, s6
272 ; VI-FLUSH-NEXT: s_mov_b32 s11, s7
273 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
274 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
275 ; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
276 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
277 ; VI-FLUSH-NEXT: s_mov_b32 s4, s0
278 ; VI-FLUSH-NEXT: s_mov_b32 s5, s1
279 ; VI-FLUSH-NEXT: v_madmk_f16 v0, v0, 0x4200, v1
280 ; VI-FLUSH-NEXT: buffer_store_short v0, off, s[4:7], 0
281 ; VI-FLUSH-NEXT: s_endpgm
283 ; VI-DENORM-LABEL: fmuladd_f16_imm_a:
284 ; VI-DENORM: ; %bb.0:
285 ; VI-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
286 ; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
287 ; VI-DENORM-NEXT: s_mov_b32 s7, 0xf000
288 ; VI-DENORM-NEXT: s_mov_b32 s6, -1
289 ; VI-DENORM-NEXT: s_mov_b32 s14, s6
290 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
291 ; VI-DENORM-NEXT: s_mov_b32 s12, s2
292 ; VI-DENORM-NEXT: s_mov_b32 s13, s3
293 ; VI-DENORM-NEXT: s_mov_b32 s15, s7
294 ; VI-DENORM-NEXT: s_mov_b32 s10, s6
295 ; VI-DENORM-NEXT: s_mov_b32 s11, s7
296 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
297 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
298 ; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
299 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
300 ; VI-DENORM-NEXT: s_mov_b32 s4, s0
301 ; VI-DENORM-NEXT: s_movk_i32 s0, 0x4200
302 ; VI-DENORM-NEXT: s_mov_b32 s5, s1
303 ; VI-DENORM-NEXT: v_fma_f16 v0, v0, s0, v1
304 ; VI-DENORM-NEXT: buffer_store_short v0, off, s[4:7], 0
305 ; VI-DENORM-NEXT: s_endpgm
307 ; GFX10-FLUSH-LABEL: fmuladd_f16_imm_a:
308 ; GFX10-FLUSH: ; %bb.0:
309 ; GFX10-FLUSH-NEXT: s_clause 0x1
310 ; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
311 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
312 ; GFX10-FLUSH-NEXT: s_mov_b32 s6, -1
313 ; GFX10-FLUSH-NEXT: s_mov_b32 s7, 0x31016000
314 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s6
315 ; GFX10-FLUSH-NEXT: s_mov_b32 s15, s7
316 ; GFX10-FLUSH-NEXT: s_mov_b32 s10, s6
317 ; GFX10-FLUSH-NEXT: s_mov_b32 s11, s7
318 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
319 ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2
320 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3
321 ; GFX10-FLUSH-NEXT: s_mov_b32 s4, s0
322 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc
323 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
324 ; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc
325 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
326 ; GFX10-FLUSH-NEXT: s_mov_b32 s5, s1
327 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
328 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
329 ; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[4:7], 0
330 ; GFX10-FLUSH-NEXT: s_endpgm
332 ; GFX10-DENORM-LABEL: fmuladd_f16_imm_a:
333 ; GFX10-DENORM: ; %bb.0:
334 ; GFX10-DENORM-NEXT: s_clause 0x1
335 ; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
336 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
337 ; GFX10-DENORM-NEXT: s_mov_b32 s6, -1
338 ; GFX10-DENORM-NEXT: s_mov_b32 s7, 0x31016000
339 ; GFX10-DENORM-NEXT: s_mov_b32 s14, s6
340 ; GFX10-DENORM-NEXT: s_mov_b32 s15, s7
341 ; GFX10-DENORM-NEXT: s_mov_b32 s10, s6
342 ; GFX10-DENORM-NEXT: s_mov_b32 s11, s7
343 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
344 ; GFX10-DENORM-NEXT: s_mov_b32 s12, s2
345 ; GFX10-DENORM-NEXT: s_mov_b32 s13, s3
346 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc
347 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
348 ; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc
349 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
350 ; GFX10-DENORM-NEXT: s_mov_b32 s4, s0
351 ; GFX10-DENORM-NEXT: s_mov_b32 s5, s1
352 ; GFX10-DENORM-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1
353 ; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[4:7], 0
354 ; GFX10-DENORM-NEXT: s_endpgm
356 ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_a:
357 ; GFX11-FLUSH: ; %bb.0:
358 ; GFX11-FLUSH-NEXT: s_clause 0x1
359 ; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
360 ; GFX11-FLUSH-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
361 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
362 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
363 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
364 ; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
365 ; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10
366 ; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11
367 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
368 ; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2
369 ; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
370 ; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0
371 ; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
372 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
373 ; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
374 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
375 ; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1
376 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
377 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
378 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
379 ; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
380 ; GFX11-FLUSH-NEXT: s_endpgm
382 ; GFX11-DENORM-LABEL: fmuladd_f16_imm_a:
383 ; GFX11-DENORM: ; %bb.0:
384 ; GFX11-DENORM-NEXT: s_clause 0x1
385 ; GFX11-DENORM-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
386 ; GFX11-DENORM-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
387 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
388 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
389 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
390 ; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
391 ; GFX11-DENORM-NEXT: s_mov_b32 s6, s10
392 ; GFX11-DENORM-NEXT: s_mov_b32 s7, s11
393 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
394 ; GFX11-DENORM-NEXT: s_mov_b32 s12, s2
395 ; GFX11-DENORM-NEXT: s_mov_b32 s13, s3
396 ; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
397 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
398 ; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
399 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
400 ; GFX11-DENORM-NEXT: s_mov_b32 s8, s0
401 ; GFX11-DENORM-NEXT: s_mov_b32 s9, s1
402 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
403 ; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0
404 ; GFX11-DENORM-NEXT: s_endpgm
407 ptr addrspace(1) %c) {
; Both loads are volatile. In the expected output above each one appears as a
; glc load (glc dlc on GFX10 and GFX11) followed by its own s_waitcnt vmcnt(0).
408 %b.val = load volatile half, ptr addrspace(1) %b
409 %c.val = load volatile half, ptr addrspace(1) %c
; Constant 3.0 in the first operand position: 3.0 * %b.val + %c.val.
410 %r.val = call half @llvm.fmuladd.f16(half 3.0, half %b.val, half %c.val)
411 store half %r.val, ptr addrspace(1) %r
; Scalar f16 case where the SECOND multiplicand is the constant 3.0. The
; other multiplicand and the addend are loaded from %a and %c, and the result
; is stored through %r. This is the operand-swapped sibling of the imm_a
; kernel; the expected instruction choices below are the same as for that
; kernel.
;
; What the autogenerated checks below pin down for each check prefix
;   SI prefix            - v_madmk_f32 with the literal 0x40400000 after
;                          widening the loaded values to f32
;   VI-FLUSH prefix      - v_madmk_f16 with the literal 0x4200
;   VI-DENORM prefix     - constant moved to an SGPR with s_movk_i32, then
;                          v_fma_f16
;   GFX10/GFX11 FLUSH    - v_mul_f16 by the literal 0x4200, then v_add_f16
;   GFX10-DENORM prefix  - v_fmamk_f16 with the literal 0x4200
;   GFX11-DENORM prefix  - v_fmac_f16 with the literal 0x4200 as a source
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
415 define amdgpu_kernel void @fmuladd_f16_imm_b(
416 ; SI-LABEL: fmuladd_f16_imm_b:
418 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
419 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
420 ; SI-NEXT: s_mov_b32 s7, 0xf000
421 ; SI-NEXT: s_mov_b32 s6, -1
422 ; SI-NEXT: s_mov_b32 s14, s6
423 ; SI-NEXT: s_waitcnt lgkmcnt(0)
424 ; SI-NEXT: s_mov_b32 s12, s2
425 ; SI-NEXT: s_mov_b32 s13, s3
426 ; SI-NEXT: s_mov_b32 s15, s7
427 ; SI-NEXT: s_mov_b32 s10, s6
428 ; SI-NEXT: s_mov_b32 s11, s7
429 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
430 ; SI-NEXT: s_waitcnt vmcnt(0)
431 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
432 ; SI-NEXT: s_waitcnt vmcnt(0)
433 ; SI-NEXT: s_mov_b32 s4, s0
434 ; SI-NEXT: s_mov_b32 s5, s1
435 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
436 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
437 ; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1
438 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
439 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
442 ; VI-FLUSH-LABEL: fmuladd_f16_imm_b:
444 ; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
445 ; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
446 ; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000
447 ; VI-FLUSH-NEXT: s_mov_b32 s6, -1
448 ; VI-FLUSH-NEXT: s_mov_b32 s14, s6
449 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
450 ; VI-FLUSH-NEXT: s_mov_b32 s12, s2
451 ; VI-FLUSH-NEXT: s_mov_b32 s13, s3
452 ; VI-FLUSH-NEXT: s_mov_b32 s15, s7
453 ; VI-FLUSH-NEXT: s_mov_b32 s10, s6
454 ; VI-FLUSH-NEXT: s_mov_b32 s11, s7
455 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
456 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
457 ; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
458 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
459 ; VI-FLUSH-NEXT: s_mov_b32 s4, s0
460 ; VI-FLUSH-NEXT: s_mov_b32 s5, s1
461 ; VI-FLUSH-NEXT: v_madmk_f16 v0, v0, 0x4200, v1
462 ; VI-FLUSH-NEXT: buffer_store_short v0, off, s[4:7], 0
463 ; VI-FLUSH-NEXT: s_endpgm
465 ; VI-DENORM-LABEL: fmuladd_f16_imm_b:
466 ; VI-DENORM: ; %bb.0:
467 ; VI-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
468 ; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
469 ; VI-DENORM-NEXT: s_mov_b32 s7, 0xf000
470 ; VI-DENORM-NEXT: s_mov_b32 s6, -1
471 ; VI-DENORM-NEXT: s_mov_b32 s14, s6
472 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
473 ; VI-DENORM-NEXT: s_mov_b32 s12, s2
474 ; VI-DENORM-NEXT: s_mov_b32 s13, s3
475 ; VI-DENORM-NEXT: s_mov_b32 s15, s7
476 ; VI-DENORM-NEXT: s_mov_b32 s10, s6
477 ; VI-DENORM-NEXT: s_mov_b32 s11, s7
478 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
479 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
480 ; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
481 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
482 ; VI-DENORM-NEXT: s_mov_b32 s4, s0
483 ; VI-DENORM-NEXT: s_movk_i32 s0, 0x4200
484 ; VI-DENORM-NEXT: s_mov_b32 s5, s1
485 ; VI-DENORM-NEXT: v_fma_f16 v0, v0, s0, v1
486 ; VI-DENORM-NEXT: buffer_store_short v0, off, s[4:7], 0
487 ; VI-DENORM-NEXT: s_endpgm
489 ; GFX10-FLUSH-LABEL: fmuladd_f16_imm_b:
490 ; GFX10-FLUSH: ; %bb.0:
491 ; GFX10-FLUSH-NEXT: s_clause 0x1
492 ; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
493 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
494 ; GFX10-FLUSH-NEXT: s_mov_b32 s6, -1
495 ; GFX10-FLUSH-NEXT: s_mov_b32 s7, 0x31016000
496 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s6
497 ; GFX10-FLUSH-NEXT: s_mov_b32 s15, s7
498 ; GFX10-FLUSH-NEXT: s_mov_b32 s10, s6
499 ; GFX10-FLUSH-NEXT: s_mov_b32 s11, s7
500 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
501 ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2
502 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3
503 ; GFX10-FLUSH-NEXT: s_mov_b32 s4, s0
504 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc
505 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
506 ; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc
507 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
508 ; GFX10-FLUSH-NEXT: s_mov_b32 s5, s1
509 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
510 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
511 ; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[4:7], 0
512 ; GFX10-FLUSH-NEXT: s_endpgm
514 ; GFX10-DENORM-LABEL: fmuladd_f16_imm_b:
515 ; GFX10-DENORM: ; %bb.0:
516 ; GFX10-DENORM-NEXT: s_clause 0x1
517 ; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
518 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
519 ; GFX10-DENORM-NEXT: s_mov_b32 s6, -1
520 ; GFX10-DENORM-NEXT: s_mov_b32 s7, 0x31016000
521 ; GFX10-DENORM-NEXT: s_mov_b32 s14, s6
522 ; GFX10-DENORM-NEXT: s_mov_b32 s15, s7
523 ; GFX10-DENORM-NEXT: s_mov_b32 s10, s6
524 ; GFX10-DENORM-NEXT: s_mov_b32 s11, s7
525 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
526 ; GFX10-DENORM-NEXT: s_mov_b32 s12, s2
527 ; GFX10-DENORM-NEXT: s_mov_b32 s13, s3
528 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc
529 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
530 ; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc
531 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
532 ; GFX10-DENORM-NEXT: s_mov_b32 s4, s0
533 ; GFX10-DENORM-NEXT: s_mov_b32 s5, s1
534 ; GFX10-DENORM-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1
535 ; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[4:7], 0
536 ; GFX10-DENORM-NEXT: s_endpgm
538 ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_b:
539 ; GFX11-FLUSH: ; %bb.0:
540 ; GFX11-FLUSH-NEXT: s_clause 0x1
541 ; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
542 ; GFX11-FLUSH-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
543 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
544 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
545 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
546 ; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
547 ; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10
548 ; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11
549 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
550 ; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2
551 ; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
552 ; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0
553 ; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
554 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
555 ; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
556 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
557 ; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1
558 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
559 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
560 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
561 ; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
562 ; GFX11-FLUSH-NEXT: s_endpgm
564 ; GFX11-DENORM-LABEL: fmuladd_f16_imm_b:
565 ; GFX11-DENORM: ; %bb.0:
566 ; GFX11-DENORM-NEXT: s_clause 0x1
567 ; GFX11-DENORM-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
568 ; GFX11-DENORM-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
569 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
570 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
571 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
572 ; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
573 ; GFX11-DENORM-NEXT: s_mov_b32 s6, s10
574 ; GFX11-DENORM-NEXT: s_mov_b32 s7, s11
575 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
576 ; GFX11-DENORM-NEXT: s_mov_b32 s12, s2
577 ; GFX11-DENORM-NEXT: s_mov_b32 s13, s3
578 ; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
579 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
580 ; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
581 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
582 ; GFX11-DENORM-NEXT: s_mov_b32 s8, s0
583 ; GFX11-DENORM-NEXT: s_mov_b32 s9, s1
584 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
585 ; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0
586 ; GFX11-DENORM-NEXT: s_endpgm
589 ptr addrspace(1) %c) {
; Both loads are volatile. In the expected output above each one appears as a
; glc load (glc dlc on GFX10 and GFX11) followed by its own s_waitcnt vmcnt(0).
590 %a.val = load volatile half, ptr addrspace(1) %a
591 %c.val = load volatile half, ptr addrspace(1) %c
; Constant 3.0 in the second operand position: %a.val * 3.0 + %c.val.
592 %r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val)
593 store half %r.val, ptr addrspace(1) %r
597 define amdgpu_kernel void @fmuladd_v2f16(
598 ; SI-LABEL: fmuladd_v2f16:
600 ; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
601 ; SI-NEXT: s_mov_b32 s11, 0xf000
602 ; SI-NEXT: s_mov_b32 s10, -1
603 ; SI-NEXT: s_mov_b32 s14, s10
604 ; SI-NEXT: s_mov_b32 s15, s11
605 ; SI-NEXT: s_waitcnt lgkmcnt(0)
606 ; SI-NEXT: s_mov_b32 s12, s2
607 ; SI-NEXT: s_mov_b32 s13, s3
608 ; SI-NEXT: s_mov_b32 s16, s4
609 ; SI-NEXT: s_mov_b32 s17, s5
610 ; SI-NEXT: s_mov_b32 s18, s10
611 ; SI-NEXT: s_mov_b32 s19, s11
612 ; SI-NEXT: s_mov_b32 s4, s6
613 ; SI-NEXT: s_mov_b32 s5, s7
614 ; SI-NEXT: s_mov_b32 s6, s10
615 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
616 ; SI-NEXT: s_mov_b32 s7, s11
617 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
618 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
619 ; SI-NEXT: s_mov_b32 s8, s0
620 ; SI-NEXT: s_mov_b32 s9, s1
621 ; SI-NEXT: s_waitcnt vmcnt(2)
622 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
623 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
624 ; SI-NEXT: s_waitcnt vmcnt(1)
625 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
626 ; SI-NEXT: s_waitcnt vmcnt(0)
627 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
628 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
629 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
630 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
631 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
632 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
633 ; SI-NEXT: v_mac_f32_e32 v5, v0, v4
634 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
635 ; SI-NEXT: v_mac_f32_e32 v2, v3, v1
636 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
637 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
638 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
639 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
642 ; VI-FLUSH-LABEL: fmuladd_v2f16:
644 ; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
645 ; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
646 ; VI-FLUSH-NEXT: s_mov_b32 s10, -1
647 ; VI-FLUSH-NEXT: s_mov_b32 s14, s10
648 ; VI-FLUSH-NEXT: s_mov_b32 s15, s11
649 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
650 ; VI-FLUSH-NEXT: s_mov_b32 s12, s2
651 ; VI-FLUSH-NEXT: s_mov_b32 s13, s3
652 ; VI-FLUSH-NEXT: s_mov_b32 s16, s4
653 ; VI-FLUSH-NEXT: s_mov_b32 s17, s5
654 ; VI-FLUSH-NEXT: s_mov_b32 s18, s10
655 ; VI-FLUSH-NEXT: s_mov_b32 s19, s11
656 ; VI-FLUSH-NEXT: s_mov_b32 s4, s6
657 ; VI-FLUSH-NEXT: s_mov_b32 s5, s7
658 ; VI-FLUSH-NEXT: s_mov_b32 s6, s10
659 ; VI-FLUSH-NEXT: s_mov_b32 s7, s11
660 ; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0
661 ; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[4:7], 0
662 ; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[16:19], 0
663 ; VI-FLUSH-NEXT: s_mov_b32 s8, s0
664 ; VI-FLUSH-NEXT: s_mov_b32 s9, s1
665 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(1)
666 ; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
667 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
668 ; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
669 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v3, 16, v3
670 ; VI-FLUSH-NEXT: v_mac_f16_e32 v1, v0, v2
671 ; VI-FLUSH-NEXT: v_or_b32_e32 v0, v1, v3
672 ; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0
673 ; VI-FLUSH-NEXT: s_endpgm
675 ; VI-DENORM-LABEL: fmuladd_v2f16:
676 ; VI-DENORM: ; %bb.0:
677 ; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
678 ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000
679 ; VI-DENORM-NEXT: s_mov_b32 s10, -1
680 ; VI-DENORM-NEXT: s_mov_b32 s14, s10
681 ; VI-DENORM-NEXT: s_mov_b32 s15, s11
682 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
683 ; VI-DENORM-NEXT: s_mov_b32 s16, s4
684 ; VI-DENORM-NEXT: s_mov_b32 s17, s5
685 ; VI-DENORM-NEXT: s_mov_b32 s4, s6
686 ; VI-DENORM-NEXT: s_mov_b32 s5, s7
687 ; VI-DENORM-NEXT: s_mov_b32 s6, s10
688 ; VI-DENORM-NEXT: s_mov_b32 s7, s11
689 ; VI-DENORM-NEXT: s_mov_b32 s12, s2
690 ; VI-DENORM-NEXT: s_mov_b32 s13, s3
691 ; VI-DENORM-NEXT: s_mov_b32 s18, s10
692 ; VI-DENORM-NEXT: s_mov_b32 s19, s11
693 ; VI-DENORM-NEXT: buffer_load_dword v0, off, s[4:7], 0
694 ; VI-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0
695 ; VI-DENORM-NEXT: buffer_load_dword v2, off, s[12:15], 0
696 ; VI-DENORM-NEXT: s_mov_b32 s8, s0
697 ; VI-DENORM-NEXT: s_mov_b32 s9, s1
698 ; VI-DENORM-NEXT: s_waitcnt vmcnt(2)
699 ; VI-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0
700 ; VI-DENORM-NEXT: s_waitcnt vmcnt(1)
701 ; VI-DENORM-NEXT: v_lshrrev_b32_e32 v4, 16, v1
702 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
703 ; VI-DENORM-NEXT: v_lshrrev_b32_e32 v5, 16, v2
704 ; VI-DENORM-NEXT: v_fma_f16 v3, v5, v4, v3
705 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3
706 ; VI-DENORM-NEXT: v_fma_f16 v0, v2, v1, v0
707 ; VI-DENORM-NEXT: v_or_b32_e32 v0, v0, v3
708 ; VI-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0
709 ; VI-DENORM-NEXT: s_endpgm
711 ; GFX10-FLUSH-LABEL: fmuladd_v2f16:
712 ; GFX10-FLUSH: ; %bb.0:
713 ; GFX10-FLUSH-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
714 ; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1
715 ; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
716 ; GFX10-FLUSH-NEXT: s_mov_b32 s6, s2
717 ; GFX10-FLUSH-NEXT: s_mov_b32 s7, s3
718 ; GFX10-FLUSH-NEXT: s_mov_b32 s18, s2
719 ; GFX10-FLUSH-NEXT: s_mov_b32 s19, s3
720 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
721 ; GFX10-FLUSH-NEXT: s_mov_b32 s4, s10
722 ; GFX10-FLUSH-NEXT: s_mov_b32 s5, s11
723 ; GFX10-FLUSH-NEXT: s_mov_b32 s16, s12
724 ; GFX10-FLUSH-NEXT: s_mov_b32 s17, s13
725 ; GFX10-FLUSH-NEXT: buffer_load_dword v0, off, s[4:7], 0
726 ; GFX10-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0
727 ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s14
728 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s15
729 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2
730 ; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3
731 ; GFX10-FLUSH-NEXT: s_mov_b32 s0, s8
732 ; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[12:15], 0
733 ; GFX10-FLUSH-NEXT: s_mov_b32 s1, s9
734 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
735 ; GFX10-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1
736 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
737 ; GFX10-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2
738 ; GFX10-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
739 ; GFX10-FLUSH-NEXT: s_endpgm
741 ; GFX10-DENORM-LABEL: fmuladd_v2f16:
742 ; GFX10-DENORM: ; %bb.0:
743 ; GFX10-DENORM-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
744 ; GFX10-DENORM-NEXT: s_mov_b32 s2, -1
745 ; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000
746 ; GFX10-DENORM-NEXT: s_mov_b32 s6, s2
747 ; GFX10-DENORM-NEXT: s_mov_b32 s7, s3
748 ; GFX10-DENORM-NEXT: s_mov_b32 s18, s2
749 ; GFX10-DENORM-NEXT: s_mov_b32 s19, s3
750 ; GFX10-DENORM-NEXT: s_mov_b32 s22, s2
751 ; GFX10-DENORM-NEXT: s_mov_b32 s23, s3
752 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
753 ; GFX10-DENORM-NEXT: s_mov_b32 s4, s10
754 ; GFX10-DENORM-NEXT: s_mov_b32 s5, s11
755 ; GFX10-DENORM-NEXT: s_mov_b32 s16, s12
756 ; GFX10-DENORM-NEXT: s_mov_b32 s17, s13
757 ; GFX10-DENORM-NEXT: s_mov_b32 s20, s14
758 ; GFX10-DENORM-NEXT: s_mov_b32 s21, s15
759 ; GFX10-DENORM-NEXT: buffer_load_dword v0, off, s[4:7], 0
760 ; GFX10-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0
761 ; GFX10-DENORM-NEXT: buffer_load_dword v2, off, s[20:23], 0
762 ; GFX10-DENORM-NEXT: s_mov_b32 s0, s8
763 ; GFX10-DENORM-NEXT: s_mov_b32 s1, s9
764 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
765 ; GFX10-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2
766 ; GFX10-DENORM-NEXT: buffer_store_dword v0, off, s[0:3], 0
767 ; GFX10-DENORM-NEXT: s_endpgm
769 ; GFX11-FLUSH-LABEL: fmuladd_v2f16:
770 ; GFX11-FLUSH: ; %bb.0:
771 ; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
772 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
773 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
774 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
775 ; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
776 ; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10
777 ; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11
778 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
779 ; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2
780 ; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
781 ; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4
782 ; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5
783 ; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[12:15], 0
784 ; GFX11-FLUSH-NEXT: buffer_load_b32 v1, off, s[16:19], 0
785 ; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6
786 ; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7
787 ; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10
788 ; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11
789 ; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0
790 ; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[4:7], 0
791 ; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1
792 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
793 ; GFX11-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1
794 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
795 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
796 ; GFX11-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2
797 ; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[8:11], 0
798 ; GFX11-FLUSH-NEXT: s_endpgm
800 ; GFX11-DENORM-LABEL: fmuladd_v2f16:
801 ; GFX11-DENORM: ; %bb.0:
802 ; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
803 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
804 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
805 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
806 ; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
807 ; GFX11-DENORM-NEXT: s_mov_b32 s18, s10
808 ; GFX11-DENORM-NEXT: s_mov_b32 s19, s11
809 ; GFX11-DENORM-NEXT: s_mov_b32 s22, s10
810 ; GFX11-DENORM-NEXT: s_mov_b32 s23, s11
811 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
812 ; GFX11-DENORM-NEXT: s_mov_b32 s12, s2
813 ; GFX11-DENORM-NEXT: s_mov_b32 s13, s3
814 ; GFX11-DENORM-NEXT: s_mov_b32 s16, s4
815 ; GFX11-DENORM-NEXT: s_mov_b32 s17, s5
816 ; GFX11-DENORM-NEXT: s_mov_b32 s20, s6
817 ; GFX11-DENORM-NEXT: s_mov_b32 s21, s7
818 ; GFX11-DENORM-NEXT: buffer_load_b32 v0, off, s[12:15], 0
819 ; GFX11-DENORM-NEXT: buffer_load_b32 v1, off, s[16:19], 0
820 ; GFX11-DENORM-NEXT: buffer_load_b32 v2, off, s[20:23], 0
821 ; GFX11-DENORM-NEXT: s_mov_b32 s8, s0
822 ; GFX11-DENORM-NEXT: s_mov_b32 s9, s1
823 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
824 ; GFX11-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2
825 ; GFX11-DENORM-NEXT: buffer_store_b32 v0, off, s[8:11], 0
826 ; GFX11-DENORM-NEXT: s_endpgm
830 ptr addrspace(1) %c) {
831 %a.val = load <2 x half>, ptr addrspace(1) %a
832 %b.val = load <2 x half>, ptr addrspace(1) %b
833 %c.val = load <2 x half>, ptr addrspace(1) %c
834 %r.val = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
835 store <2 x half> %r.val, ptr addrspace(1) %r