; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}mac_f16:
; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort v[[C_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI: buffer_store_short v[[R_F16]]
; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
; VI: buffer_store_short v[[C_F16]]
; Scalar f16 multiply-add: fmul + fadd should be combined into a single mac/mad.
define amdgpu_kernel void @mac_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_f16_same_add:
; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
; Two mac candidates sharing one addend %c.val: only one may use the
; destructive mac form; the other must become a mad (addend preserved).
define amdgpu_kernel void @mac_f16_same_add(
    half addrspace(1)* %r0,
    half addrspace(1)* %r1,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c,
    half addrspace(1)* %d,
    half addrspace(1)* %e) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %d.val = load half, half addrspace(1)* %d
  %e.val = load half, half addrspace(1)* %e

  %t0.val = fmul half %a.val, %b.val
  %r0.val = fadd half %t0.val, %c.val

  %t1.val = fmul half %d.val, %e.val
  %r1.val = fadd half %t1.val, %c.val

  store half %r0.val, half addrspace(1)* %r0
  store half %r1.val, half addrspace(1)* %r1
  ret void
}
; GCN-LABEL: {{^}}mac_f16_neg_a:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; fneg of the first multiplicand (via fsub from -0.0) should fold into a
; source-negate modifier on a mad instead of emitting a separate subtract.
define amdgpu_kernel void @mac_f16_neg_a(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %a.neg = fsub half -0.0, %a.val
  %t.val = fmul half %a.neg, %b.val
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_f16_neg_b:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; fneg of the second multiplicand; fmul is commutative so the negate may be
; folded onto either multiply operand of the resulting mad.
define amdgpu_kernel void @mac_f16_neg_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %b.neg = fsub half -0.0, %b.val
  %t.val = fmul half %a.val, %b.neg
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_f16_neg_c:
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; fneg of the addend should fold into a negate modifier on the mad's third
; source operand.
define amdgpu_kernel void @mac_f16_neg_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %c.neg = fsub half -0.0, %c.val
  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, %c.neg

  store half %r.val, half addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
; fsub from +0.0 is NOT an fneg without nsz (attribute #0 keeps signed zeros),
; so an explicit subtract must be emitted rather than a negate modifier.
define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %a.neg = fsub half 0.0, %a.val
  %t.val = fmul half %a.neg, %b.val
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
; Same as neg_a_safe_fp_math but the explicit 0 - x subtract feeds the second
; multiplicand of the mac.
define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %b.neg = fsub half 0.0, %b.val
  %t.val = fmul half %a.val, %b.neg
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_f16_neg_c_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_mac_f32_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
; Same as the other *_safe_fp_math cases but the explicit subtract produces
; the addend, which then serves as the mac destination/accumulator.
define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #0 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %c.neg = fsub half 0.0, %c.val
  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, %c.neg

  store half %r.val, half addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_f16_neg_a_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
; With "no-signed-zeros-fp-math"="true" (attribute #1), fsub from +0.0 may be
; treated as fneg and folded into a mad source-negate modifier.
define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #1 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %a.neg = fsub half 0.0, %a.val
  %t.val = fmul half %a.neg, %b.val
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_f16_neg_b_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
; nsz variant with the +0.0 - x subtract on the second multiplicand; the
; negate still folds into the mad (fmul commutes).
define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #1 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %b.neg = fsub half 0.0, %b.val
  %t.val = fmul half %a.val, %b.neg
  %r.val = fadd half %t.val, %c.val

  store half %r.val, half addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_f16_neg_c_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]]
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
; nsz variant with the +0.0 - x subtract on the addend; folds into a negate
; modifier on the mad's third source.
define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) #1 {
entry:
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c

  %c.neg = fsub half 0.0, %c.val
  %t.val = fmul half %a.val, %b.val
  %r.val = fadd half %t.val, %c.neg

  store half %r.val, half addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_v2f16:
; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]
; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
; Vector <2 x half> multiply-add. The s.barrier calls between the loads pin
; the load order so the register-capture CHECK patterns stay stable.
define amdgpu_kernel void @mac_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  call void @llvm.amdgcn.s.barrier() #2
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  call void @llvm.amdgcn.s.barrier() #2
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %t.val = fmul <2 x half> %a.val, %b.val
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_v2f16_same_add:
; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; Vector version of mac_f16_same_add: two <2 x half> mac candidates share
; addend %c.val, forcing a mad/mac split per lane.
define amdgpu_kernel void @mac_v2f16_same_add(
    <2 x half> addrspace(1)* %r0,
    <2 x half> addrspace(1)* %r1,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c,
    <2 x half> addrspace(1)* %d,
    <2 x half> addrspace(1)* %e) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
  %e.val = load <2 x half>, <2 x half> addrspace(1)* %e

  %t0.val = fmul <2 x half> %a.val, %b.val
  %r0.val = fadd <2 x half> %t0.val, %c.val

  %t1.val = fmul <2 x half> %d.val, %e.val
  %r1.val = fadd <2 x half> %t1.val, %c.val

  store <2 x half> %r0.val, <2 x half> addrspace(1)* %r0
  store <2 x half> %r1.val, <2 x half> addrspace(1)* %r1
  ret void
}
; GCN-LABEL: {{^}}mac_v2f16_neg_a:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; Per-lane fneg (fsub from splat -0.0) of the first multiplicand should fold
; into a negate modifier on each lane's mad.
define amdgpu_kernel void @mac_v2f16_neg_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %a.neg = fsub <2 x half> <half -0.0, half -0.0>, %a.val
  %t.val = fmul <2 x half> %a.neg, %b.val
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_v2f16_neg_b:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; Per-lane fneg of the second multiplicand; the negate commutes onto a mad
; source modifier in each lane.
define amdgpu_kernel void @mac_v2f16_neg_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %b.neg = fsub <2 x half> <half -0.0, half -0.0>, %b.val
  %t.val = fmul <2 x half> %a.val, %b.neg
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_v2f16_neg_c:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; Per-lane fneg of the addend folds into a negate modifier on the mad's third
; source in each lane.
define amdgpu_kernel void @mac_v2f16_neg_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %c.neg = fsub <2 x half> <half -0.0, half -0.0>, %c.val
  %t.val = fmul <2 x half> %a.val, %b.val
  %r.val = fadd <2 x half> %t.val, %c.neg

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
; Without nsz (attribute #0), fsub from splat +0.0 cannot become fneg, so an
; explicit per-lane subtract must be emitted before the mac.
define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val
  %t.val = fmul <2 x half> %a.neg, %b.val
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
; Signed-zeros-preserving variant with the explicit 0 - x subtract feeding
; the second multiplicand of each lane's mac.
define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val
  %t.val = fmul <2 x half> %a.val, %b.neg
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math:
; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-DAG: v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
; Signed-zeros-preserving variant with the explicit subtract producing the
; addend, which then becomes each lane's mac accumulator.
define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #0 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val
  %t.val = fmul <2 x half> %a.val, %b.val
  %r.val = fadd <2 x half> %t.val, %c.neg

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_v2f16_neg_a_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; With nsz enabled (attribute #1), the splat +0.0 subtract is treated as fneg
; and folds into a negate modifier on each lane's mad.
define amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #1 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val
  %t.val = fmul <2 x half> %a.neg, %b.val
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_v2f16_neg_b_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
; nsz variant with the splat +0.0 subtract on the second multiplicand; the
; negate still folds into the per-lane mad.
define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #1 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val
  %t.val = fmul <2 x half> %a.val, %b.neg
  %r.val = fadd <2 x half> %t.val, %c.val

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
; GCN-LABEL: {{^}}mac_v2f16_neg_c_nsz_fp_math:
; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
; nsz variant with the splat +0.0 subtract on the addend; folds into a negate
; modifier on the third source of each lane's mad.
define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) #1 {
entry:
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c

  %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val
  %t.val = fmul <2 x half> %a.val, %b.val
  %r.val = fadd <2 x half> %t.val, %c.neg

  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
declare void @llvm.amdgcn.s.barrier() #2

; #0 = signed zeros preserved (fsub from +0.0 is not fneg);
; #1 = nsz enabled (fsub from +0.0 may be treated as fneg);
; #2 = barrier intrinsic attributes.
attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #2 = { nounwind convergent }