1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-FLUSH %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-DENORM %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-FLUSH %s
7 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DENORM %s
8 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH %s
9 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-DENORM %s
; Declarations of the fmuladd intrinsic in its scalar half and <2 x half>
; forms.
11 declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
12 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
; Scalar half fmuladd where all three operands come from memory: the kernel
; loads %a, %b and %c from global memory (addrspace 1), calls
; llvm.fmuladd.f16 on them and stores the result through %r.
;
; What the autogenerated checks below currently show:
;   - tahiti converts the operands to f32, uses v_mac_f32 and converts back;
;   - fiji uses v_mac_f16 with -denormal-fp-math=preserve-sign and v_fma_f16
;     with -denormal-fp-math=ieee;
;   - gfx1010 and gfx1100 use separate v_mul_f16 and v_add_f16 with
;     preserve-sign, and v_fmac_f16 with ieee.
14 define amdgpu_kernel void @fmuladd_f16(
15 ; SI-LABEL: fmuladd_f16:
17 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
18 ; SI-NEXT: s_mov_b32 s11, 0xf000
19 ; SI-NEXT: s_mov_b32 s10, -1
20 ; SI-NEXT: s_mov_b32 s14, s10
21 ; SI-NEXT: s_mov_b32 s15, s11
22 ; SI-NEXT: s_waitcnt lgkmcnt(0)
23 ; SI-NEXT: s_mov_b32 s12, s2
24 ; SI-NEXT: s_mov_b32 s13, s3
25 ; SI-NEXT: s_mov_b32 s16, s4
26 ; SI-NEXT: s_mov_b32 s17, s5
27 ; SI-NEXT: s_mov_b32 s18, s10
28 ; SI-NEXT: s_mov_b32 s19, s11
29 ; SI-NEXT: s_mov_b32 s4, s6
30 ; SI-NEXT: s_mov_b32 s5, s7
31 ; SI-NEXT: s_mov_b32 s6, s10
32 ; SI-NEXT: s_mov_b32 s7, s11
33 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
34 ; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
35 ; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
36 ; SI-NEXT: s_mov_b32 s8, s0
37 ; SI-NEXT: s_mov_b32 s9, s1
38 ; SI-NEXT: s_waitcnt vmcnt(2)
39 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
40 ; SI-NEXT: s_waitcnt vmcnt(1)
41 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
42 ; SI-NEXT: s_waitcnt vmcnt(0)
43 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
44 ; SI-NEXT: v_mac_f32_e32 v2, v0, v1
45 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v2
46 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
49 ; VI-FLUSH-LABEL: fmuladd_f16:
51 ; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
52 ; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
53 ; VI-FLUSH-NEXT: s_mov_b32 s10, -1
54 ; VI-FLUSH-NEXT: s_mov_b32 s14, s10
55 ; VI-FLUSH-NEXT: s_mov_b32 s15, s11
56 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
57 ; VI-FLUSH-NEXT: s_mov_b32 s12, s2
58 ; VI-FLUSH-NEXT: s_mov_b32 s13, s3
59 ; VI-FLUSH-NEXT: s_mov_b32 s16, s4
60 ; VI-FLUSH-NEXT: s_mov_b32 s17, s5
61 ; VI-FLUSH-NEXT: s_mov_b32 s18, s10
62 ; VI-FLUSH-NEXT: s_mov_b32 s19, s11
63 ; VI-FLUSH-NEXT: s_mov_b32 s4, s6
64 ; VI-FLUSH-NEXT: s_mov_b32 s5, s7
65 ; VI-FLUSH-NEXT: s_mov_b32 s6, s10
66 ; VI-FLUSH-NEXT: s_mov_b32 s7, s11
67 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0
68 ; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0
69 ; VI-FLUSH-NEXT: buffer_load_ushort v2, off, s[4:7], 0
70 ; VI-FLUSH-NEXT: s_mov_b32 s8, s0
71 ; VI-FLUSH-NEXT: s_mov_b32 s9, s1
72 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
73 ; VI-FLUSH-NEXT: v_mac_f16_e32 v2, v0, v1
74 ; VI-FLUSH-NEXT: buffer_store_short v2, off, s[8:11], 0
75 ; VI-FLUSH-NEXT: s_endpgm
77 ; VI-DENORM-LABEL: fmuladd_f16:
79 ; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
80 ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000
81 ; VI-DENORM-NEXT: s_mov_b32 s10, -1
82 ; VI-DENORM-NEXT: s_mov_b32 s14, s10
83 ; VI-DENORM-NEXT: s_mov_b32 s15, s11
84 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
85 ; VI-DENORM-NEXT: s_mov_b32 s12, s2
86 ; VI-DENORM-NEXT: s_mov_b32 s13, s3
87 ; VI-DENORM-NEXT: s_mov_b32 s16, s4
88 ; VI-DENORM-NEXT: s_mov_b32 s17, s5
89 ; VI-DENORM-NEXT: s_mov_b32 s18, s10
90 ; VI-DENORM-NEXT: s_mov_b32 s19, s11
91 ; VI-DENORM-NEXT: s_mov_b32 s4, s6
92 ; VI-DENORM-NEXT: s_mov_b32 s5, s7
93 ; VI-DENORM-NEXT: s_mov_b32 s6, s10
94 ; VI-DENORM-NEXT: s_mov_b32 s7, s11
95 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0
96 ; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0
97 ; VI-DENORM-NEXT: buffer_load_ushort v2, off, s[4:7], 0
98 ; VI-DENORM-NEXT: s_mov_b32 s8, s0
99 ; VI-DENORM-NEXT: s_mov_b32 s9, s1
100 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
101 ; VI-DENORM-NEXT: v_fma_f16 v0, v0, v1, v2
102 ; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0
103 ; VI-DENORM-NEXT: s_endpgm
105 ; GFX10-FLUSH-LABEL: fmuladd_f16:
106 ; GFX10-FLUSH: ; %bb.0:
107 ; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
108 ; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1
109 ; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
110 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10
111 ; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11
112 ; GFX10-FLUSH-NEXT: s_mov_b32 s18, s10
113 ; GFX10-FLUSH-NEXT: s_mov_b32 s19, s11
114 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
115 ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2
116 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3
117 ; GFX10-FLUSH-NEXT: s_mov_b32 s16, s4
118 ; GFX10-FLUSH-NEXT: s_mov_b32 s17, s5
119 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0
120 ; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0
121 ; GFX10-FLUSH-NEXT: s_mov_b32 s4, s6
122 ; GFX10-FLUSH-NEXT: s_mov_b32 s5, s7
123 ; GFX10-FLUSH-NEXT: s_mov_b32 s6, s10
124 ; GFX10-FLUSH-NEXT: s_mov_b32 s7, s11
125 ; GFX10-FLUSH-NEXT: s_mov_b32 s8, s0
126 ; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[4:7], 0
127 ; GFX10-FLUSH-NEXT: s_mov_b32 s9, s1
128 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
129 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
130 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
131 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2
132 ; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0
133 ; GFX10-FLUSH-NEXT: s_endpgm
135 ; GFX10-DENORM-LABEL: fmuladd_f16:
136 ; GFX10-DENORM: ; %bb.0:
137 ; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
138 ; GFX10-DENORM-NEXT: s_mov_b32 s10, -1
139 ; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000
140 ; GFX10-DENORM-NEXT: s_mov_b32 s14, s10
141 ; GFX10-DENORM-NEXT: s_mov_b32 s15, s11
142 ; GFX10-DENORM-NEXT: s_mov_b32 s18, s10
143 ; GFX10-DENORM-NEXT: s_mov_b32 s19, s11
144 ; GFX10-DENORM-NEXT: s_mov_b32 s22, s10
145 ; GFX10-DENORM-NEXT: s_mov_b32 s23, s11
146 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
147 ; GFX10-DENORM-NEXT: s_mov_b32 s12, s2
148 ; GFX10-DENORM-NEXT: s_mov_b32 s13, s3
149 ; GFX10-DENORM-NEXT: s_mov_b32 s16, s4
150 ; GFX10-DENORM-NEXT: s_mov_b32 s17, s5
151 ; GFX10-DENORM-NEXT: s_mov_b32 s20, s6
152 ; GFX10-DENORM-NEXT: s_mov_b32 s21, s7
153 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0
154 ; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0
155 ; GFX10-DENORM-NEXT: buffer_load_ushort v2, off, s[20:23], 0
156 ; GFX10-DENORM-NEXT: s_mov_b32 s8, s0
157 ; GFX10-DENORM-NEXT: s_mov_b32 s9, s1
158 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
159 ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1
160 ; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[8:11], 0
161 ; GFX10-DENORM-NEXT: s_endpgm
163 ; GFX11-FLUSH-LABEL: fmuladd_f16:
164 ; GFX11-FLUSH: ; %bb.0:
165 ; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
166 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
167 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
168 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
169 ; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
170 ; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10
171 ; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11
172 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
173 ; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2
174 ; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
175 ; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4
176 ; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5
177 ; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0
178 ; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[16:19], 0
179 ; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6
180 ; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7
181 ; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10
182 ; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11
183 ; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0
184 ; GFX11-FLUSH-NEXT: buffer_load_u16 v2, off, s[4:7], 0
185 ; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1
186 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
187 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
188 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
189 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
190 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2
191 ; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
192 ; GFX11-FLUSH-NEXT: s_nop 0
193 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
194 ; GFX11-FLUSH-NEXT: s_endpgm
196 ; GFX11-DENORM-LABEL: fmuladd_f16:
197 ; GFX11-DENORM: ; %bb.0:
198 ; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
199 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
200 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
201 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
202 ; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
203 ; GFX11-DENORM-NEXT: s_mov_b32 s18, s10
204 ; GFX11-DENORM-NEXT: s_mov_b32 s19, s11
205 ; GFX11-DENORM-NEXT: s_mov_b32 s22, s10
206 ; GFX11-DENORM-NEXT: s_mov_b32 s23, s11
207 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
208 ; GFX11-DENORM-NEXT: s_mov_b32 s12, s2
209 ; GFX11-DENORM-NEXT: s_mov_b32 s13, s3
210 ; GFX11-DENORM-NEXT: s_mov_b32 s16, s4
211 ; GFX11-DENORM-NEXT: s_mov_b32 s17, s5
212 ; GFX11-DENORM-NEXT: s_mov_b32 s20, s6
213 ; GFX11-DENORM-NEXT: s_mov_b32 s21, s7
214 ; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0
215 ; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[16:19], 0
216 ; GFX11-DENORM-NEXT: buffer_load_u16 v2, off, s[20:23], 0
217 ; GFX11-DENORM-NEXT: s_mov_b32 s8, s0
218 ; GFX11-DENORM-NEXT: s_mov_b32 s9, s1
219 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
220 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1
221 ; GFX11-DENORM-NEXT: buffer_store_b16 v2, off, s[8:11], 0
222 ; GFX11-DENORM-NEXT: s_nop 0
223 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
224 ; GFX11-DENORM-NEXT: s_endpgm
228 ptr addrspace(1) %c) {
; Fetch the two multiplicands and the addend from global memory.
229 %a.val = load half, ptr addrspace(1) %a
230 %b.val = load half, ptr addrspace(1) %b
231 %c.val = load half, ptr addrspace(1) %c
; a * b + c; the checks above show both fused and unfused lowerings of this
; call depending on the run configuration.
232 %r.val = call half @llvm.fmuladd.f16(half %a.val, half %b.val, half %c.val)
; Write the result back through %r.
233 store half %r.val, ptr addrspace(1) %r
; Scalar half fmuladd whose first multiplicand is the constant 3.0; the other
; multiplicand (%b) and the addend (%c) are read with volatile loads.
;
; In the checks below the constant appears as 0x4200 in the f16 instructions
; (as a literal operand, or moved into an SGPR first for fiji with
; -denormal-fp-math=ieee) and as 0x40400000 in the f32 v_madmk_f32 used on
; tahiti. The volatile loads show up with glc (plus dlc on gfx1010 and
; gfx1100), each followed by s_waitcnt vmcnt(0).
237 define amdgpu_kernel void @fmuladd_f16_imm_a(
238 ; SI-LABEL: fmuladd_f16_imm_a:
240 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
241 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
242 ; SI-NEXT: s_mov_b32 s3, 0xf000
243 ; SI-NEXT: s_mov_b32 s2, -1
244 ; SI-NEXT: s_mov_b32 s14, s2
245 ; SI-NEXT: s_waitcnt lgkmcnt(0)
246 ; SI-NEXT: s_mov_b32 s12, s6
247 ; SI-NEXT: s_mov_b32 s13, s7
248 ; SI-NEXT: s_mov_b32 s15, s3
249 ; SI-NEXT: s_mov_b32 s10, s2
250 ; SI-NEXT: s_mov_b32 s11, s3
251 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
252 ; SI-NEXT: s_waitcnt vmcnt(0)
253 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
254 ; SI-NEXT: s_waitcnt vmcnt(0)
255 ; SI-NEXT: s_mov_b32 s0, s4
256 ; SI-NEXT: s_mov_b32 s1, s5
257 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
258 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
259 ; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1
260 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
261 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
264 ; VI-FLUSH-LABEL: fmuladd_f16_imm_a:
266 ; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
267 ; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
268 ; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
269 ; VI-FLUSH-NEXT: s_mov_b32 s2, -1
270 ; VI-FLUSH-NEXT: s_mov_b32 s14, s2
271 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
272 ; VI-FLUSH-NEXT: s_mov_b32 s12, s6
273 ; VI-FLUSH-NEXT: s_mov_b32 s13, s7
274 ; VI-FLUSH-NEXT: s_mov_b32 s15, s3
275 ; VI-FLUSH-NEXT: s_mov_b32 s10, s2
276 ; VI-FLUSH-NEXT: s_mov_b32 s11, s3
277 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
278 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
279 ; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
280 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
281 ; VI-FLUSH-NEXT: s_mov_b32 s0, s4
282 ; VI-FLUSH-NEXT: s_mov_b32 s1, s5
283 ; VI-FLUSH-NEXT: v_madmk_f16 v0, v0, 0x4200, v1
284 ; VI-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0
285 ; VI-FLUSH-NEXT: s_endpgm
287 ; VI-DENORM-LABEL: fmuladd_f16_imm_a:
288 ; VI-DENORM: ; %bb.0:
289 ; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
290 ; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
291 ; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000
292 ; VI-DENORM-NEXT: s_mov_b32 s2, -1
293 ; VI-DENORM-NEXT: s_mov_b32 s14, s2
294 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
295 ; VI-DENORM-NEXT: s_mov_b32 s12, s6
296 ; VI-DENORM-NEXT: s_mov_b32 s13, s7
297 ; VI-DENORM-NEXT: s_mov_b32 s15, s3
298 ; VI-DENORM-NEXT: s_mov_b32 s10, s2
299 ; VI-DENORM-NEXT: s_mov_b32 s11, s3
300 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
301 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
302 ; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
303 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
304 ; VI-DENORM-NEXT: s_mov_b32 s0, s4
305 ; VI-DENORM-NEXT: s_movk_i32 s4, 0x4200
306 ; VI-DENORM-NEXT: s_mov_b32 s1, s5
307 ; VI-DENORM-NEXT: v_fma_f16 v0, v0, s4, v1
308 ; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0
309 ; VI-DENORM-NEXT: s_endpgm
311 ; GFX10-FLUSH-LABEL: fmuladd_f16_imm_a:
312 ; GFX10-FLUSH: ; %bb.0:
313 ; GFX10-FLUSH-NEXT: s_clause 0x1
314 ; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
315 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
316 ; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1
317 ; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
318 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2
319 ; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3
320 ; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2
321 ; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3
322 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
323 ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6
324 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7
325 ; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4
326 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc
327 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
328 ; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc
329 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
330 ; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5
331 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
332 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
333 ; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0
334 ; GFX10-FLUSH-NEXT: s_endpgm
336 ; GFX10-DENORM-LABEL: fmuladd_f16_imm_a:
337 ; GFX10-DENORM: ; %bb.0:
338 ; GFX10-DENORM-NEXT: s_clause 0x1
339 ; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
340 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
341 ; GFX10-DENORM-NEXT: s_mov_b32 s2, -1
342 ; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000
343 ; GFX10-DENORM-NEXT: s_mov_b32 s14, s2
344 ; GFX10-DENORM-NEXT: s_mov_b32 s15, s3
345 ; GFX10-DENORM-NEXT: s_mov_b32 s10, s2
346 ; GFX10-DENORM-NEXT: s_mov_b32 s11, s3
347 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
348 ; GFX10-DENORM-NEXT: s_mov_b32 s12, s6
349 ; GFX10-DENORM-NEXT: s_mov_b32 s13, s7
350 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc
351 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
352 ; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc
353 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
354 ; GFX10-DENORM-NEXT: s_mov_b32 s0, s4
355 ; GFX10-DENORM-NEXT: s_mov_b32 s1, s5
356 ; GFX10-DENORM-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1
357 ; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0
358 ; GFX10-DENORM-NEXT: s_endpgm
360 ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_a:
361 ; GFX11-FLUSH: ; %bb.0:
362 ; GFX11-FLUSH-NEXT: s_clause 0x1
363 ; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
364 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
365 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
366 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
367 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
368 ; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
369 ; GFX11-FLUSH-NEXT: s_mov_b32 s2, s10
370 ; GFX11-FLUSH-NEXT: s_mov_b32 s3, s11
371 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
372 ; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
373 ; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
374 ; GFX11-FLUSH-NEXT: s_mov_b32 s8, s4
375 ; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
376 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
377 ; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
378 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
379 ; GFX11-FLUSH-NEXT: s_mov_b32 s9, s5
380 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
381 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
382 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
383 ; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
384 ; GFX11-FLUSH-NEXT: s_nop 0
385 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
386 ; GFX11-FLUSH-NEXT: s_endpgm
388 ; GFX11-DENORM-LABEL: fmuladd_f16_imm_a:
389 ; GFX11-DENORM: ; %bb.0:
390 ; GFX11-DENORM-NEXT: s_clause 0x1
391 ; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
392 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
393 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
394 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
395 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
396 ; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
397 ; GFX11-DENORM-NEXT: s_mov_b32 s2, s10
398 ; GFX11-DENORM-NEXT: s_mov_b32 s3, s11
399 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
400 ; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
401 ; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
402 ; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
403 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
404 ; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
405 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
406 ; GFX11-DENORM-NEXT: s_mov_b32 s8, s4
407 ; GFX11-DENORM-NEXT: s_mov_b32 s9, s5
408 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
409 ; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0
410 ; GFX11-DENORM-NEXT: s_nop 0
411 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
412 ; GFX11-DENORM-NEXT: s_endpgm
415 ptr addrspace(1) %c) {
; Volatile loads of the remaining multiplicand and the addend.
416 %b.val = load volatile half, ptr addrspace(1) %b
417 %c.val = load volatile half, ptr addrspace(1) %c
; 3.0 * b + c, with the constant in the first multiplicand position.
418 %r.val = call half @llvm.fmuladd.f16(half 3.0, half %b.val, half %c.val)
; Write the result back through %r.
419 store half %r.val, ptr addrspace(1) %r
; Scalar half fmuladd whose second multiplicand is the constant 3.0; the
; first multiplicand (%a) and the addend (%c) are read with volatile loads.
;
; In the checks below the constant appears as 0x4200 in the f16 instructions
; (as a literal operand, or moved into an SGPR first for fiji with
; -denormal-fp-math=ieee) and as 0x40400000 in the f32 v_madmk_f32 used on
; tahiti.
423 define amdgpu_kernel void @fmuladd_f16_imm_b(
424 ; SI-LABEL: fmuladd_f16_imm_b:
426 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
427 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
428 ; SI-NEXT: s_mov_b32 s3, 0xf000
429 ; SI-NEXT: s_mov_b32 s2, -1
430 ; SI-NEXT: s_mov_b32 s14, s2
431 ; SI-NEXT: s_waitcnt lgkmcnt(0)
432 ; SI-NEXT: s_mov_b32 s12, s6
433 ; SI-NEXT: s_mov_b32 s13, s7
434 ; SI-NEXT: s_mov_b32 s15, s3
435 ; SI-NEXT: s_mov_b32 s10, s2
436 ; SI-NEXT: s_mov_b32 s11, s3
437 ; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
438 ; SI-NEXT: s_waitcnt vmcnt(0)
439 ; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
440 ; SI-NEXT: s_waitcnt vmcnt(0)
441 ; SI-NEXT: s_mov_b32 s0, s4
442 ; SI-NEXT: s_mov_b32 s1, s5
443 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
444 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
445 ; SI-NEXT: v_madmk_f32 v0, v0, 0x40400000, v1
446 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
447 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
450 ; VI-FLUSH-LABEL: fmuladd_f16_imm_b:
452 ; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
453 ; VI-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
454 ; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
455 ; VI-FLUSH-NEXT: s_mov_b32 s2, -1
456 ; VI-FLUSH-NEXT: s_mov_b32 s14, s2
457 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
458 ; VI-FLUSH-NEXT: s_mov_b32 s12, s6
459 ; VI-FLUSH-NEXT: s_mov_b32 s13, s7
460 ; VI-FLUSH-NEXT: s_mov_b32 s15, s3
461 ; VI-FLUSH-NEXT: s_mov_b32 s10, s2
462 ; VI-FLUSH-NEXT: s_mov_b32 s11, s3
463 ; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
464 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
465 ; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
466 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
467 ; VI-FLUSH-NEXT: s_mov_b32 s0, s4
468 ; VI-FLUSH-NEXT: s_mov_b32 s1, s5
469 ; VI-FLUSH-NEXT: v_madmk_f16 v0, v0, 0x4200, v1
470 ; VI-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0
471 ; VI-FLUSH-NEXT: s_endpgm
473 ; VI-DENORM-LABEL: fmuladd_f16_imm_b:
474 ; VI-DENORM: ; %bb.0:
475 ; VI-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
476 ; VI-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
477 ; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000
478 ; VI-DENORM-NEXT: s_mov_b32 s2, -1
479 ; VI-DENORM-NEXT: s_mov_b32 s14, s2
480 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
481 ; VI-DENORM-NEXT: s_mov_b32 s12, s6
482 ; VI-DENORM-NEXT: s_mov_b32 s13, s7
483 ; VI-DENORM-NEXT: s_mov_b32 s15, s3
484 ; VI-DENORM-NEXT: s_mov_b32 s10, s2
485 ; VI-DENORM-NEXT: s_mov_b32 s11, s3
486 ; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
487 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
488 ; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
489 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
490 ; VI-DENORM-NEXT: s_mov_b32 s0, s4
491 ; VI-DENORM-NEXT: s_movk_i32 s4, 0x4200
492 ; VI-DENORM-NEXT: s_mov_b32 s1, s5
493 ; VI-DENORM-NEXT: v_fma_f16 v0, v0, s4, v1
494 ; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0
495 ; VI-DENORM-NEXT: s_endpgm
497 ; GFX10-FLUSH-LABEL: fmuladd_f16_imm_b:
498 ; GFX10-FLUSH: ; %bb.0:
499 ; GFX10-FLUSH-NEXT: s_clause 0x1
500 ; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
501 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
502 ; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1
503 ; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
504 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2
505 ; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3
506 ; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2
507 ; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3
508 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
509 ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6
510 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7
511 ; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4
512 ; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc
513 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
514 ; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc
515 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
516 ; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5
517 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
518 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
519 ; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0
520 ; GFX10-FLUSH-NEXT: s_endpgm
522 ; GFX10-DENORM-LABEL: fmuladd_f16_imm_b:
523 ; GFX10-DENORM: ; %bb.0:
524 ; GFX10-DENORM-NEXT: s_clause 0x1
525 ; GFX10-DENORM-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
526 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
527 ; GFX10-DENORM-NEXT: s_mov_b32 s2, -1
528 ; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000
529 ; GFX10-DENORM-NEXT: s_mov_b32 s14, s2
530 ; GFX10-DENORM-NEXT: s_mov_b32 s15, s3
531 ; GFX10-DENORM-NEXT: s_mov_b32 s10, s2
532 ; GFX10-DENORM-NEXT: s_mov_b32 s11, s3
533 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
534 ; GFX10-DENORM-NEXT: s_mov_b32 s12, s6
535 ; GFX10-DENORM-NEXT: s_mov_b32 s13, s7
536 ; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc
537 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
538 ; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc
539 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
540 ; GFX10-DENORM-NEXT: s_mov_b32 s0, s4
541 ; GFX10-DENORM-NEXT: s_mov_b32 s1, s5
542 ; GFX10-DENORM-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1
543 ; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0
544 ; GFX10-DENORM-NEXT: s_endpgm
546 ; GFX11-FLUSH-LABEL: fmuladd_f16_imm_b:
547 ; GFX11-FLUSH: ; %bb.0:
548 ; GFX11-FLUSH-NEXT: s_clause 0x1
549 ; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
550 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
551 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
552 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
553 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
554 ; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
555 ; GFX11-FLUSH-NEXT: s_mov_b32 s2, s10
556 ; GFX11-FLUSH-NEXT: s_mov_b32 s3, s11
557 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
558 ; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
559 ; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
560 ; GFX11-FLUSH-NEXT: s_mov_b32 s8, s4
561 ; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
562 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
563 ; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
564 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
565 ; GFX11-FLUSH-NEXT: s_mov_b32 s9, s5
566 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
567 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
568 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
569 ; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
570 ; GFX11-FLUSH-NEXT: s_nop 0
571 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
572 ; GFX11-FLUSH-NEXT: s_endpgm
574 ; GFX11-DENORM-LABEL: fmuladd_f16_imm_b:
575 ; GFX11-DENORM: ; %bb.0:
576 ; GFX11-DENORM-NEXT: s_clause 0x1
577 ; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
578 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
579 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
580 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
581 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
582 ; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
583 ; GFX11-DENORM-NEXT: s_mov_b32 s2, s10
584 ; GFX11-DENORM-NEXT: s_mov_b32 s3, s11
585 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
586 ; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
587 ; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
588 ; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
589 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
590 ; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
591 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
592 ; GFX11-DENORM-NEXT: s_mov_b32 s8, s4
593 ; GFX11-DENORM-NEXT: s_mov_b32 s9, s5
594 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
595 ; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0
596 ; GFX11-DENORM-NEXT: s_nop 0
597 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
598 ; GFX11-DENORM-NEXT: s_endpgm
601 ptr addrspace(1) %c) {
; Volatile loads of the remaining multiplicand and the addend.
602 %a.val = load volatile half, ptr addrspace(1) %a
603 %c.val = load volatile half, ptr addrspace(1) %c
; a * 3.0 + c, with the constant in the second multiplicand position.
604 %r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val)
; Write the result back through %r.
605 store half %r.val, ptr addrspace(1) %r
609 define amdgpu_kernel void @fmuladd_v2f16(
610 ; SI-LABEL: fmuladd_v2f16:
612 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
613 ; SI-NEXT: s_mov_b32 s11, 0xf000
614 ; SI-NEXT: s_mov_b32 s10, -1
615 ; SI-NEXT: s_mov_b32 s14, s10
616 ; SI-NEXT: s_mov_b32 s15, s11
617 ; SI-NEXT: s_waitcnt lgkmcnt(0)
618 ; SI-NEXT: s_mov_b32 s12, s2
619 ; SI-NEXT: s_mov_b32 s13, s3
620 ; SI-NEXT: s_mov_b32 s16, s4
621 ; SI-NEXT: s_mov_b32 s17, s5
622 ; SI-NEXT: s_mov_b32 s18, s10
623 ; SI-NEXT: s_mov_b32 s19, s11
624 ; SI-NEXT: s_mov_b32 s4, s6
625 ; SI-NEXT: s_mov_b32 s5, s7
626 ; SI-NEXT: s_mov_b32 s6, s10
627 ; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
628 ; SI-NEXT: s_mov_b32 s7, s11
629 ; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
630 ; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
631 ; SI-NEXT: s_mov_b32 s8, s0
632 ; SI-NEXT: s_mov_b32 s9, s1
633 ; SI-NEXT: s_waitcnt vmcnt(2)
634 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
635 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
636 ; SI-NEXT: s_waitcnt vmcnt(1)
637 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
638 ; SI-NEXT: s_waitcnt vmcnt(0)
639 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
640 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
641 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
642 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
643 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
644 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
645 ; SI-NEXT: v_mac_f32_e32 v5, v0, v4
646 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v5
647 ; SI-NEXT: v_mac_f32_e32 v2, v3, v1
648 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
649 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
650 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
651 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
654 ; VI-FLUSH-LABEL: fmuladd_v2f16:
656 ; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
657 ; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
658 ; VI-FLUSH-NEXT: s_mov_b32 s10, -1
659 ; VI-FLUSH-NEXT: s_mov_b32 s14, s10
660 ; VI-FLUSH-NEXT: s_mov_b32 s15, s11
661 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
662 ; VI-FLUSH-NEXT: s_mov_b32 s12, s2
663 ; VI-FLUSH-NEXT: s_mov_b32 s13, s3
664 ; VI-FLUSH-NEXT: s_mov_b32 s16, s4
665 ; VI-FLUSH-NEXT: s_mov_b32 s17, s5
666 ; VI-FLUSH-NEXT: s_mov_b32 s18, s10
667 ; VI-FLUSH-NEXT: s_mov_b32 s19, s11
668 ; VI-FLUSH-NEXT: s_mov_b32 s4, s6
669 ; VI-FLUSH-NEXT: s_mov_b32 s5, s7
670 ; VI-FLUSH-NEXT: s_mov_b32 s6, s10
671 ; VI-FLUSH-NEXT: s_mov_b32 s7, s11
672 ; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0
673 ; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[4:7], 0
674 ; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[16:19], 0
675 ; VI-FLUSH-NEXT: s_mov_b32 s8, s0
676 ; VI-FLUSH-NEXT: s_mov_b32 s9, s1
677 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(1)
678 ; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
679 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
680 ; VI-FLUSH-NEXT: v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
681 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v3, 16, v3
682 ; VI-FLUSH-NEXT: v_mac_f16_e32 v1, v0, v2
683 ; VI-FLUSH-NEXT: v_or_b32_e32 v0, v1, v3
684 ; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0
685 ; VI-FLUSH-NEXT: s_endpgm
687 ; VI-DENORM-LABEL: fmuladd_v2f16:
688 ; VI-DENORM: ; %bb.0:
689 ; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
690 ; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000
691 ; VI-DENORM-NEXT: s_mov_b32 s10, -1
692 ; VI-DENORM-NEXT: s_mov_b32 s14, s10
693 ; VI-DENORM-NEXT: s_mov_b32 s15, s11
694 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
695 ; VI-DENORM-NEXT: s_mov_b32 s16, s4
696 ; VI-DENORM-NEXT: s_mov_b32 s17, s5
697 ; VI-DENORM-NEXT: s_mov_b32 s4, s6
698 ; VI-DENORM-NEXT: s_mov_b32 s5, s7
699 ; VI-DENORM-NEXT: s_mov_b32 s6, s10
700 ; VI-DENORM-NEXT: s_mov_b32 s7, s11
701 ; VI-DENORM-NEXT: s_mov_b32 s12, s2
702 ; VI-DENORM-NEXT: s_mov_b32 s13, s3
703 ; VI-DENORM-NEXT: s_mov_b32 s18, s10
704 ; VI-DENORM-NEXT: s_mov_b32 s19, s11
705 ; VI-DENORM-NEXT: buffer_load_dword v0, off, s[4:7], 0
706 ; VI-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0
707 ; VI-DENORM-NEXT: buffer_load_dword v2, off, s[12:15], 0
708 ; VI-DENORM-NEXT: s_mov_b32 s8, s0
709 ; VI-DENORM-NEXT: s_mov_b32 s9, s1
710 ; VI-DENORM-NEXT: s_waitcnt vmcnt(2)
711 ; VI-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0
712 ; VI-DENORM-NEXT: s_waitcnt vmcnt(1)
713 ; VI-DENORM-NEXT: v_lshrrev_b32_e32 v4, 16, v1
714 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
715 ; VI-DENORM-NEXT: v_lshrrev_b32_e32 v5, 16, v2
716 ; VI-DENORM-NEXT: v_fma_f16 v3, v5, v4, v3
717 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3
718 ; VI-DENORM-NEXT: v_fma_f16 v0, v2, v1, v0
719 ; VI-DENORM-NEXT: v_or_b32_e32 v0, v0, v3
720 ; VI-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0
721 ; VI-DENORM-NEXT: s_endpgm
723 ; GFX10-FLUSH-LABEL: fmuladd_v2f16:
724 ; GFX10-FLUSH: ; %bb.0:
725 ; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
726 ; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1
727 ; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
728 ; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10
729 ; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11
730 ; GFX10-FLUSH-NEXT: s_mov_b32 s18, s10
731 ; GFX10-FLUSH-NEXT: s_mov_b32 s19, s11
732 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
733 ; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2
734 ; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3
735 ; GFX10-FLUSH-NEXT: s_mov_b32 s16, s4
736 ; GFX10-FLUSH-NEXT: s_mov_b32 s17, s5
737 ; GFX10-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0
738 ; GFX10-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0
739 ; GFX10-FLUSH-NEXT: s_mov_b32 s4, s6
740 ; GFX10-FLUSH-NEXT: s_mov_b32 s5, s7
741 ; GFX10-FLUSH-NEXT: s_mov_b32 s6, s10
742 ; GFX10-FLUSH-NEXT: s_mov_b32 s7, s11
743 ; GFX10-FLUSH-NEXT: s_mov_b32 s8, s0
744 ; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0
745 ; GFX10-FLUSH-NEXT: s_mov_b32 s9, s1
746 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
747 ; GFX10-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1
748 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
749 ; GFX10-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2
750 ; GFX10-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0
751 ; GFX10-FLUSH-NEXT: s_endpgm
753 ; GFX10-DENORM-LABEL: fmuladd_v2f16:
754 ; GFX10-DENORM: ; %bb.0:
755 ; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
756 ; GFX10-DENORM-NEXT: s_mov_b32 s10, -1
757 ; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000
758 ; GFX10-DENORM-NEXT: s_mov_b32 s14, s10
759 ; GFX10-DENORM-NEXT: s_mov_b32 s15, s11
760 ; GFX10-DENORM-NEXT: s_mov_b32 s18, s10
761 ; GFX10-DENORM-NEXT: s_mov_b32 s19, s11
762 ; GFX10-DENORM-NEXT: s_mov_b32 s22, s10
763 ; GFX10-DENORM-NEXT: s_mov_b32 s23, s11
764 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
765 ; GFX10-DENORM-NEXT: s_mov_b32 s12, s2
766 ; GFX10-DENORM-NEXT: s_mov_b32 s13, s3
767 ; GFX10-DENORM-NEXT: s_mov_b32 s16, s4
768 ; GFX10-DENORM-NEXT: s_mov_b32 s17, s5
769 ; GFX10-DENORM-NEXT: s_mov_b32 s20, s6
770 ; GFX10-DENORM-NEXT: s_mov_b32 s21, s7
771 ; GFX10-DENORM-NEXT: buffer_load_dword v0, off, s[12:15], 0
772 ; GFX10-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0
773 ; GFX10-DENORM-NEXT: buffer_load_dword v2, off, s[20:23], 0
774 ; GFX10-DENORM-NEXT: s_mov_b32 s8, s0
775 ; GFX10-DENORM-NEXT: s_mov_b32 s9, s1
776 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
777 ; GFX10-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2
778 ; GFX10-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0
779 ; GFX10-DENORM-NEXT: s_endpgm
781 ; GFX11-FLUSH-LABEL: fmuladd_v2f16:
782 ; GFX11-FLUSH: ; %bb.0:
783 ; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
784 ; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
785 ; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
786 ; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
787 ; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
788 ; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10
789 ; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11
790 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
791 ; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2
792 ; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
793 ; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4
794 ; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5
795 ; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[12:15], 0
796 ; GFX11-FLUSH-NEXT: buffer_load_b32 v1, off, s[16:19], 0
797 ; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6
798 ; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7
799 ; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10
800 ; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11
801 ; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0
802 ; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[4:7], 0
803 ; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1
804 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
805 ; GFX11-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1
806 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
807 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
808 ; GFX11-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2
809 ; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[8:11], 0
810 ; GFX11-FLUSH-NEXT: s_nop 0
811 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
812 ; GFX11-FLUSH-NEXT: s_endpgm
814 ; GFX11-DENORM-LABEL: fmuladd_v2f16:
815 ; GFX11-DENORM: ; %bb.0:
816 ; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
817 ; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
818 ; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
819 ; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
820 ; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
821 ; GFX11-DENORM-NEXT: s_mov_b32 s18, s10
822 ; GFX11-DENORM-NEXT: s_mov_b32 s19, s11
823 ; GFX11-DENORM-NEXT: s_mov_b32 s22, s10
824 ; GFX11-DENORM-NEXT: s_mov_b32 s23, s11
825 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
826 ; GFX11-DENORM-NEXT: s_mov_b32 s12, s2
827 ; GFX11-DENORM-NEXT: s_mov_b32 s13, s3
828 ; GFX11-DENORM-NEXT: s_mov_b32 s16, s4
829 ; GFX11-DENORM-NEXT: s_mov_b32 s17, s5
830 ; GFX11-DENORM-NEXT: s_mov_b32 s20, s6
831 ; GFX11-DENORM-NEXT: s_mov_b32 s21, s7
832 ; GFX11-DENORM-NEXT: buffer_load_b32 v0, off, s[12:15], 0
833 ; GFX11-DENORM-NEXT: buffer_load_b32 v1, off, s[16:19], 0
834 ; GFX11-DENORM-NEXT: buffer_load_b32 v2, off, s[20:23], 0
835 ; GFX11-DENORM-NEXT: s_mov_b32 s8, s0
836 ; GFX11-DENORM-NEXT: s_mov_b32 s9, s1
837 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
838 ; GFX11-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2
839 ; GFX11-DENORM-NEXT: buffer_store_b32 v0, off, s[8:11], 0
840 ; GFX11-DENORM-NEXT: s_nop 0
841 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
842 ; GFX11-DENORM-NEXT: s_endpgm
846 ptr addrspace(1) %c) {
847 %a.val = load <2 x half>, ptr addrspace(1) %a
848 %b.val = load <2 x half>, ptr addrspace(1) %b
849 %c.val = load <2 x half>, ptr addrspace(1) %c
850 %r.val = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
851 store <2 x half> %r.val, ptr addrspace(1) %r