1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s
; mac_f16 computes r = (a * b) + c on single half values loaded from
; addrspace(1) pointers, written as a separate fmul and fadd.
; The SI checks expect the three operands to be converted to f32, combined by
; one v_mac_f32_e32 that accumulates into the converted c register, and the
; result converted back to f16 before the store. The VI checks expect a
; single v_mac_f16_e32 accumulating into the register loaded from %c.
4 ; GCN-LABEL: {{^}}mac_f16:
5 ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
6 ; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
7 ; GCN: {{buffer|flat}}_load_ushort v[[C_F16:[0-9]+]]
8 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
9 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
10 ; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
11 ; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
12 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
13 ; SI: buffer_store_short v[[R_F16]]
14 ; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
15 ; VI: buffer_store_short v[[C_F16]]
17 define amdgpu_kernel void @mac_f16(
18 half addrspace(1)* %r,
19 half addrspace(1)* %a,
20 half addrspace(1)* %b,
21 half addrspace(1)* %c) #0 {
23 %a.val = load half, half addrspace(1)* %a
24 %b.val = load half, half addrspace(1)* %b
25 %c.val = load half, half addrspace(1)* %c
; Multiply and add are separate IR instructions; the checks above expect
; them to be selected as one mac instruction.
27 %t.val = fmul half %a.val, %b.val
28 %r.val = fadd half %t.val, %c.val
30 store half %r.val, half addrspace(1)* %r
; mac_f16_same_add performs two multiply-adds, a*b + c and d*e + c, that
; share the addend %c, and stores them to %r0 and %r1.
; The checks expect one result to come from a three-operand v_mad that reads
; the shared addend register (captured as ADD) and the other from a v_mac
; that accumulates into that same register.
; Every register pattern accepts a VGPR number of any width.
34 ; GCN-LABEL: {{^}}mac_f16_same_add:
35 ; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
36 ; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
38 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
39 ; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
41 define amdgpu_kernel void @mac_f16_same_add(
42 half addrspace(1)* %r0,
43 half addrspace(1)* %r1,
44 half addrspace(1)* %a,
45 half addrspace(1)* %b,
46 half addrspace(1)* %c,
47 half addrspace(1)* %d,
48 half addrspace(1)* %e) #0 {
50 %a.val = load half, half addrspace(1)* %a
51 %b.val = load half, half addrspace(1)* %b
52 %c.val = load half, half addrspace(1)* %c
53 %d.val = load half, half addrspace(1)* %d
54 %e.val = load half, half addrspace(1)* %e
; First multiply-add using %c.val as the addend.
56 %t0.val = fmul half %a.val, %b.val
57 %r0.val = fadd half %t0.val, %c.val
; Second multiply-add reusing the same %c.val addend.
59 %t1.val = fmul half %d.val, %e.val
60 %r1.val = fadd half %t1.val, %c.val
62 store half %r0.val, half addrspace(1)* %r0
63 store half %r1.val, half addrspace(1)* %r1
; mac_f16_neg_a computes r = (-a * b) + c, where the negation is written as
; fsub from -0.0.
; The checks expect the negation to appear as a "-" source modifier on the
; first multiplicand of a v_mad (f32 after conversion on SI, f16 on VI).
67 ; GCN-LABEL: {{^}}mac_f16_neg_a:
68 ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
69 ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
70 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
71 ; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
74 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
76 define amdgpu_kernel void @mac_f16_neg_a(
77 half addrspace(1)* %r,
78 half addrspace(1)* %a,
79 half addrspace(1)* %b,
80 half addrspace(1)* %c) #0 {
82 %a.val = load half, half addrspace(1)* %a
83 %b.val = load half, half addrspace(1)* %b
84 %c.val = load half, half addrspace(1)* %c
; Negate %a.val by subtracting it from -0.0.
86 %a.neg = fsub half -0.0, %a.val
87 %t.val = fmul half %a.neg, %b.val
88 %r.val = fadd half %t.val, %c.val
90 store half %r.val, half addrspace(1)* %r
; mac_f16_neg_b computes r = (a * -b) + c, where the negation is written as
; fsub from -0.0.
; The checks expect a v_mad whose first multiplicand carries the "-" source
; modifier. CVT_A, CVT_B and CVT_C name the first, second and third
; conversion matched, in order.
94 ; GCN-LABEL: {{^}}mac_f16_neg_b:
95 ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
96 ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
97 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
98 ; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
101 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
103 define amdgpu_kernel void @mac_f16_neg_b(
104 half addrspace(1)* %r,
105 half addrspace(1)* %a,
106 half addrspace(1)* %b,
107 half addrspace(1)* %c) #0 {
109 %a.val = load half, half addrspace(1)* %a
110 %b.val = load half, half addrspace(1)* %b
111 %c.val = load half, half addrspace(1)* %c
; Negate %b.val by subtracting it from -0.0.
113 %b.neg = fsub half -0.0, %b.val
114 %t.val = fmul half %a.val, %b.neg
115 %r.val = fadd half %t.val, %c.val
117 store half %r.val, half addrspace(1)* %r
; mac_f16_neg_c computes r = (a * b) + (-c), where the negation is written
; as fsub from -0.0.
; The checks expect a v_mad whose addend (last source operand) carries the
; "-" source modifier.
121 ; GCN-LABEL: {{^}}mac_f16_neg_c:
122 ; SI: v_cvt_f32_f16_e32
123 ; SI: v_cvt_f32_f16_e32
124 ; SI: v_cvt_f32_f16_e32
125 ; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
128 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
130 define amdgpu_kernel void @mac_f16_neg_c(
131 half addrspace(1)* %r,
132 half addrspace(1)* %a,
133 half addrspace(1)* %b,
134 half addrspace(1)* %c) #0 {
136 %a.val = load half, half addrspace(1)* %a
137 %b.val = load half, half addrspace(1)* %b
138 %c.val = load half, half addrspace(1)* %c
; Negate %c.val by subtracting it from -0.0.
140 %c.neg = fsub half -0.0, %c.val
141 %t.val = fmul half %a.val, %b.val
142 %r.val = fadd half %t.val, %c.neg
144 store half %r.val, half addrspace(1)* %r
; mac_f16_neg_a_safe_fp_math computes r = ((0.0 - a) * b) + c with attribute
; group #0, i.e. "no-signed-zeros-fp-math"="false".
; The subtraction is from +0.0 (0.0 - x differs from -x when x is +0.0), and
; the checks expect it to remain an explicit v_sub from 0 whose result is the
; first multiplicand of a v_mac.
148 ; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math:
149 ; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
150 ; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
151 ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
152 ; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
154 define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math(
155 half addrspace(1)* %r,
156 half addrspace(1)* %a,
157 half addrspace(1)* %b,
158 half addrspace(1)* %c) #0 {
160 %a.val = load half, half addrspace(1)* %a
161 %b.val = load half, half addrspace(1)* %b
162 %c.val = load half, half addrspace(1)* %c
; Subtraction from positive zero, not from -0.0.
164 %a.neg = fsub half 0.0, %a.val
165 %t.val = fmul half %a.neg, %b.val
166 %r.val = fadd half %t.val, %c.val
168 store half %r.val, half addrspace(1)* %r
; mac_f16_neg_b_safe_fp_math computes r = (a * (0.0 - b)) + c with attribute
; group #0, i.e. "no-signed-zeros-fp-math"="false".
; The checks expect an explicit v_sub from 0 whose result is the second
; multiplicand of a v_mac.
; NOTE(review): the capture is named NEG_A, but in this kernel it presumably
; holds the 0.0 - b result; confirm.
172 ; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math:
173 ; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
174 ; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
175 ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
176 ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
178 define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math(
179 half addrspace(1)* %r,
180 half addrspace(1)* %a,
181 half addrspace(1)* %b,
182 half addrspace(1)* %c) #0 {
184 %a.val = load half, half addrspace(1)* %a
185 %b.val = load half, half addrspace(1)* %b
186 %c.val = load half, half addrspace(1)* %c
; Subtraction from positive zero, not from -0.0.
188 %b.neg = fsub half 0.0, %b.val
189 %t.val = fmul half %a.val, %b.neg
190 %r.val = fadd half %t.val, %c.val
192 store half %r.val, half addrspace(1)* %r
; mac_f16_neg_c_safe_fp_math computes r = (a * b) + (0.0 - c) with attribute
; group #0, i.e. "no-signed-zeros-fp-math"="false".
; The checks expect an explicit v_sub from 0 whose result register is the
; accumulator (destination) of a v_mac.
; NOTE(review): the capture is named NEG_A, but in this kernel it presumably
; holds the 0.0 - c result; confirm.
196 ; GCN-LABEL: {{^}}mac_f16_neg_c_safe_fp_math:
197 ; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
198 ; SI: v_mac_f32_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
199 ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
200 ; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
202 define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math(
203 half addrspace(1)* %r,
204 half addrspace(1)* %a,
205 half addrspace(1)* %b,
206 half addrspace(1)* %c) #0 {
208 %a.val = load half, half addrspace(1)* %a
209 %b.val = load half, half addrspace(1)* %b
210 %c.val = load half, half addrspace(1)* %c
; Subtraction from positive zero, not from -0.0.
212 %c.neg = fsub half 0.0, %c.val
213 %t.val = fmul half %a.val, %b.val
214 %r.val = fadd half %t.val, %c.neg
216 store half %r.val, half addrspace(1)* %r
; mac_f16_neg_a_nsz_fp_math computes r = ((0.0 - a) * b) + c with attribute
; group #1, i.e. "no-signed-zeros-fp-math"="true".
; With signed zeros ignorable, the checks expect the subtraction from 0.0 to
; become a "-" source modifier on the first multiplicand of a v_mad instead
; of a separate v_sub.
; Every register pattern accepts a VGPR number of any width.
220 ; GCN-LABEL: {{^}}mac_f16_neg_a_nsz_fp_math:
221 ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
222 ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
223 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
224 ; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
227 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
229 define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math(
230 half addrspace(1)* %r,
231 half addrspace(1)* %a,
232 half addrspace(1)* %b,
233 half addrspace(1)* %c) #1 {
235 %a.val = load half, half addrspace(1)* %a
236 %b.val = load half, half addrspace(1)* %b
237 %c.val = load half, half addrspace(1)* %c
; Subtraction from positive zero; attribute group #1 sets
; "no-signed-zeros-fp-math"="true" for this kernel.
239 %a.neg = fsub half 0.0, %a.val
240 %t.val = fmul half %a.neg, %b.val
241 %r.val = fadd half %t.val, %c.val
243 store half %r.val, half addrspace(1)* %r
; mac_f16_neg_b_nsz_fp_math computes r = (a * (0.0 - b)) + c with attribute
; group #1, i.e. "no-signed-zeros-fp-math"="true".
; The checks expect a v_mad whose first multiplicand carries the "-" source
; modifier instead of a separate v_sub. CVT_A, CVT_B and CVT_C name the
; first, second and third conversion matched, in order.
; Every register pattern accepts a VGPR number of any width.
247 ; GCN-LABEL: {{^}}mac_f16_neg_b_nsz_fp_math:
248 ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
249 ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
250 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
251 ; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
254 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
256 define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math(
257 half addrspace(1)* %r,
258 half addrspace(1)* %a,
259 half addrspace(1)* %b,
260 half addrspace(1)* %c) #1 {
262 %a.val = load half, half addrspace(1)* %a
263 %b.val = load half, half addrspace(1)* %b
264 %c.val = load half, half addrspace(1)* %c
; Subtraction from positive zero; attribute group #1 sets
; "no-signed-zeros-fp-math"="true" for this kernel.
266 %b.neg = fsub half 0.0, %b.val
267 %t.val = fmul half %a.val, %b.neg
268 %r.val = fadd half %t.val, %c.val
270 store half %r.val, half addrspace(1)* %r
; mac_f16_neg_c_nsz_fp_math computes r = (a * b) + (0.0 - c) with attribute
; group #1, i.e. "no-signed-zeros-fp-math"="true".
; The checks expect a v_mad whose addend (last source operand) carries the
; "-" source modifier instead of a separate v_sub.
; Every register pattern accepts a VGPR number of any width.
274 ; GCN-LABEL: {{^}}mac_f16_neg_c_nsz_fp_math:
275 ; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
276 ; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
277 ; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
278 ; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]]
281 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
283 define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math(
284 half addrspace(1)* %r,
285 half addrspace(1)* %a,
286 half addrspace(1)* %b,
287 half addrspace(1)* %c) #1 {
289 %a.val = load half, half addrspace(1)* %a
290 %b.val = load half, half addrspace(1)* %b
291 %c.val = load half, half addrspace(1)* %c
; Subtraction from positive zero; attribute group #1 sets
; "no-signed-zeros-fp-math"="true" for this kernel.
293 %c.neg = fsub half 0.0, %c.val
294 %t.val = fmul half %a.val, %b.val
295 %r.val = fadd half %t.val, %c.neg
297 store half %r.val, half addrspace(1)* %r
; mac_v2f16 computes r = (a * b) + c on <2 x half> vectors, each loaded as
; one dword.
; The SI checks expect the high halves to be extracted with a 16-bit right
; shift, all six halves converted to f32, two v_mac_f32_e32 accumulating into
; the converted c registers, and the two results converted back and packed
; with a 16-bit left shift plus v_or.
; The VI checks expect a v_mac_f16_sdwa reading WORD_1 of a and b for the
; high half, a v_mac_f16_e32 for the low half, and the same shift-and-or
; packing. Both targets finish with a dword store of the packed result.
301 ; GCN-LABEL: {{^}}mac_v2f16:
302 ; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
303 ; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
304 ; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]
306 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
307 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
308 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
310 ; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
311 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
312 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
314 ; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
315 ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
316 ; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
318 ; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
319 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
320 ; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
321 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
322 ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
324 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
326 ; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
327 ; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
328 ; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
329 ; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
331 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]
333 ; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
335 define amdgpu_kernel void @mac_v2f16(
336 <2 x half> addrspace(1)* %r,
337 <2 x half> addrspace(1)* %a,
338 <2 x half> addrspace(1)* %b,
339 <2 x half> addrspace(1)* %c) #0 {
; NOTE(review): the s.barrier calls between the loads presumably pin the
; a, b, c load order that the A/B/C captures above depend on; confirm.
341 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
342 call void @llvm.amdgcn.s.barrier() #2
343 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
344 call void @llvm.amdgcn.s.barrier() #2
345 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
347 %t.val = fmul <2 x half> %a.val, %b.val
348 %r.val = fadd <2 x half> %t.val, %c.val
350 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; mac_v2f16_same_add performs two <2 x half> multiply-adds, a*b + c and
; d*e + c, that share the addend %c, and stores them to %r0 and %r1.
; The SI checks expect the sequence v_mad, v_mac, v_mad, v_mac on f32. The VI
; checks expect, in any order, two v_mac_f16_sdwa reading WORD_1 of both
; sources, one v_mad_f16 and one v_mac_f16_e32.
; Every register pattern accepts a VGPR number of any width.
354 ; GCN-LABEL: {{^}}mac_v2f16_same_add:
355 ; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
356 ; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
357 ; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
358 ; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
360 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
361 ; VI-DAG: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
362 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
363 ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
366 define amdgpu_kernel void @mac_v2f16_same_add(
367 <2 x half> addrspace(1)* %r0,
368 <2 x half> addrspace(1)* %r1,
369 <2 x half> addrspace(1)* %a,
370 <2 x half> addrspace(1)* %b,
371 <2 x half> addrspace(1)* %c,
372 <2 x half> addrspace(1)* %d,
373 <2 x half> addrspace(1)* %e) #0 {
375 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
376 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
377 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
378 %d.val = load <2 x half>, <2 x half> addrspace(1)* %d
379 %e.val = load <2 x half>, <2 x half> addrspace(1)* %e
; First multiply-add using %c.val as the addend.
381 %t0.val = fmul <2 x half> %a.val, %b.val
382 %r0.val = fadd <2 x half> %t0.val, %c.val
; Second multiply-add reusing the same %c.val addend.
384 %t1.val = fmul <2 x half> %d.val, %e.val
385 %r1.val = fadd <2 x half> %t1.val, %c.val
387 store <2 x half> %r0.val, <2 x half> addrspace(1)* %r0
388 store <2 x half> %r1.val, <2 x half> addrspace(1)* %r1
; mac_v2f16_neg_a computes r = (-a * b) + c on <2 x half> vectors, where the
; negation is an fsub from a <-0.0, -0.0> vector.
; The checks expect two v_mad instructions (f32 on SI, f16 on VI), each with
; a "-" source modifier on the first multiplicand.
392 ; GCN-LABEL: {{^}}mac_v2f16_neg_a:
393 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
394 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
396 ; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
397 ; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
400 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
401 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
403 define amdgpu_kernel void @mac_v2f16_neg_a(
404 <2 x half> addrspace(1)* %r,
405 <2 x half> addrspace(1)* %a,
406 <2 x half> addrspace(1)* %b,
407 <2 x half> addrspace(1)* %c) #0 {
409 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
410 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
411 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
; Negate both lanes of %a.val by subtracting from -0.0.
413 %a.neg = fsub <2 x half> <half -0.0, half -0.0>, %a.val
414 %t.val = fmul <2 x half> %a.neg, %b.val
415 %r.val = fadd <2 x half> %t.val, %c.val
417 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; mac_v2f16_neg_b computes r = (a * -b) + c on <2 x half> vectors, where the
; negation is an fsub from a <-0.0, -0.0> vector.
; The checks expect two v_mad instructions (f32 on SI, f16 on VI), each with
; a "-" source modifier on the first multiplicand.
; The label check ends in ":" so it matches only this kernel's own label and
; not the longer mac_v2f16_neg_b_* names.
421 ; GCN-LABEL: {{^}}mac_v2f16_neg_b:
422 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
423 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
424 ; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
425 ; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
429 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
430 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
432 define amdgpu_kernel void @mac_v2f16_neg_b(
433 <2 x half> addrspace(1)* %r,
434 <2 x half> addrspace(1)* %a,
435 <2 x half> addrspace(1)* %b,
436 <2 x half> addrspace(1)* %c) #0 {
438 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
439 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
440 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
; Negate both lanes of %b.val by subtracting from -0.0.
442 %b.neg = fsub <2 x half> <half -0.0, half -0.0>, %b.val
443 %t.val = fmul <2 x half> %a.val, %b.neg
444 %r.val = fadd <2 x half> %t.val, %c.val
446 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; mac_v2f16_neg_c computes r = (a * b) + (-c) on <2 x half> vectors, where
; the negation is an fsub from a <-0.0, -0.0> vector.
; The SI checks expect six f16-to-f32 conversions followed by two v_mad_f32
; with a "-" modifier on the addend. The VI checks expect two v_mad_f16 with
; the same negated addend.
450 ; GCN-LABEL: {{^}}mac_v2f16_neg_c:
451 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
452 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
453 ; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
454 ; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
455 ; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
456 ; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
458 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
459 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
462 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
463 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
465 define amdgpu_kernel void @mac_v2f16_neg_c(
466 <2 x half> addrspace(1)* %r,
467 <2 x half> addrspace(1)* %a,
468 <2 x half> addrspace(1)* %b,
469 <2 x half> addrspace(1)* %c) #0 {
471 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
472 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
473 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
; Negate both lanes of %c.val by subtracting from -0.0.
475 %c.neg = fsub <2 x half> <half -0.0, half -0.0>, %c.val
476 %t.val = fmul <2 x half> %a.val, %b.val
477 %r.val = fadd <2 x half> %t.val, %c.neg
479 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; mac_v2f16_neg_a_safe_fp_math computes r = ((0.0 - a) * b) + c on <2 x half>
; vectors with attribute group #0, i.e. "no-signed-zeros-fp-math"="false".
; The checks expect the subtraction from +0.0 to stay explicit. On SI that is
; two v_sub_f32 from 0, each feeding the first multiplicand of a v_mac_f32.
; On VI it is a v_sub_f16_e32 for one half and a v_sub_f16_sdwa (reading
; WORD_1, with zero materialized by v_mov) for the other, feeding
; v_mac_f16_e32 and v_mac_f16_sdwa respectively.
483 ; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math:
485 ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
486 ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
487 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
488 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
490 ; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
491 ; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
492 ; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
493 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
494 ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
497 define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
498 <2 x half> addrspace(1)* %r,
499 <2 x half> addrspace(1)* %a,
500 <2 x half> addrspace(1)* %b,
501 <2 x half> addrspace(1)* %c) #0 {
503 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
504 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
505 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
; Subtraction from positive zero in both lanes, not from -0.0.
507 %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val
508 %t.val = fmul <2 x half> %a.neg, %b.val
509 %r.val = fadd <2 x half> %t.val, %c.val
511 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; mac_v2f16_neg_b_safe_fp_math computes r = (a * (0.0 - b)) + c on <2 x half>
; vectors with attribute group #0, i.e. "no-signed-zeros-fp-math"="false".
; The checks expect explicit subtractions from 0 whose results are the second
; multiplicand of the v_mac instructions. On VI the high half goes through
; v_sub_f16_sdwa / v_mac_f16_sdwa.
; NOTE(review): the captures are named NEG_A0/NEG_A1, but in this kernel
; they presumably hold the 0.0 - b results; confirm.
515 ; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math:
517 ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
518 ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
519 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
520 ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
522 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
523 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
524 ; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
525 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
526 ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
529 define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
530 <2 x half> addrspace(1)* %r,
531 <2 x half> addrspace(1)* %a,
532 <2 x half> addrspace(1)* %b,
533 <2 x half> addrspace(1)* %c) #0 {
535 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
536 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
537 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
; Subtraction from positive zero in both lanes, not from -0.0.
539 %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val
540 %t.val = fmul <2 x half> %a.val, %b.neg
541 %r.val = fadd <2 x half> %t.val, %c.val
543 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; mac_v2f16_neg_c_safe_fp_math computes r = (a * b) + (0.0 - c) on <2 x half>
; vectors with attribute group #0, i.e. "no-signed-zeros-fp-math"="false".
; The checks expect explicit subtractions from 0 whose result registers are
; the accumulators (destinations) of the v_mac instructions. On VI the high
; half goes through v_sub_f16_sdwa / v_mac_f16_sdwa.
; NOTE(review): the captures are named NEG_A0/NEG_A1, but in this kernel
; they presumably hold the 0.0 - c results; confirm.
547 ; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math:
549 ; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
550 ; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
551 ; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
552 ; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
554 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
555 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
556 ; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
557 ; VI-DAG: v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
558 ; VI-DAG: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
561 define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math(
562 <2 x half> addrspace(1)* %r,
563 <2 x half> addrspace(1)* %a,
564 <2 x half> addrspace(1)* %b,
565 <2 x half> addrspace(1)* %c) #0 {
567 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
568 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
569 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
; Subtraction from positive zero in both lanes, not from -0.0.
571 %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val
572 %t.val = fmul <2 x half> %a.val, %b.val
573 %r.val = fadd <2 x half> %t.val, %c.neg
575 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; mac_v2f16_neg_a_nsz_fp_math computes r = ((0.0 - a) * b) + c on <2 x half>
; vectors with attribute group #1, i.e. "no-signed-zeros-fp-math"="true".
; The checks expect the subtraction from 0.0 to become a "-" source modifier
; on the first multiplicand of two v_mad instructions (after six f16-to-f32
; conversions on SI) instead of separate v_sub instructions.
; The final VI operand is matched as a plain VGPR number of any width.
579 ; GCN-LABEL: {{^}}mac_v2f16_neg_a_nsz_fp_math:
580 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
581 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
582 ; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
583 ; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
584 ; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
585 ; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
587 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
588 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
591 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
592 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
594 define amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math(
595 <2 x half> addrspace(1)* %r,
596 <2 x half> addrspace(1)* %a,
597 <2 x half> addrspace(1)* %b,
598 <2 x half> addrspace(1)* %c) #1 {
600 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
601 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
602 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
; Subtraction from positive zero; attribute group #1 sets
; "no-signed-zeros-fp-math"="true" for this kernel.
604 %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val
605 %t.val = fmul <2 x half> %a.neg, %b.val
606 %r.val = fadd <2 x half> %t.val, %c.val
608 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; mac_v2f16_neg_b_nsz_fp_math computes r = (a * (0.0 - b)) + c on <2 x half>
; vectors with attribute group #1, i.e. "no-signed-zeros-fp-math"="true".
; The checks expect two v_mad instructions whose first multiplicand carries
; the "-" source modifier (after six f16-to-f32 conversions on SI) instead of
; separate v_sub instructions.
; The final VI operand is matched as a plain VGPR number of any width.
612 ; GCN-LABEL: {{^}}mac_v2f16_neg_b_nsz_fp_math:
613 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
614 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
615 ; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
616 ; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
617 ; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
618 ; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
620 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
621 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
624 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
625 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
627 define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math(
628 <2 x half> addrspace(1)* %r,
629 <2 x half> addrspace(1)* %a,
630 <2 x half> addrspace(1)* %b,
631 <2 x half> addrspace(1)* %c) #1 {
633 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
634 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
635 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
; Subtraction from positive zero; attribute group #1 sets
; "no-signed-zeros-fp-math"="true" for this kernel.
637 %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val
638 %t.val = fmul <2 x half> %a.val, %b.neg
639 %r.val = fadd <2 x half> %t.val, %c.val
641 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; mac_v2f16_neg_c_nsz_fp_math computes r = (a * b) + (0.0 - c) on <2 x half>
; vectors with attribute group #1, i.e. "no-signed-zeros-fp-math"="true".
; The checks expect two v_mad instructions whose addend (last source operand)
; carries the "-" source modifier (after six f16-to-f32 conversions on SI)
; instead of separate v_sub instructions.
; The final VI operand is matched as a plain VGPR number of any width.
645 ; GCN-LABEL: {{^}}mac_v2f16_neg_c_nsz_fp_math:
646 ; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
647 ; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
648 ; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
649 ; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
650 ; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
651 ; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
653 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
654 ; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
657 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
658 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
660 define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math(
661 <2 x half> addrspace(1)* %r,
662 <2 x half> addrspace(1)* %a,
663 <2 x half> addrspace(1)* %b,
664 <2 x half> addrspace(1)* %c) #1 {
666 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
667 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
668 %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
; Subtraction from positive zero; attribute group #1 sets
; "no-signed-zeros-fp-math"="true" for this kernel.
670 %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val
671 %t.val = fmul <2 x half> %a.val, %b.val
672 %r.val = fadd <2 x half> %t.val, %c.neg
674 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
; Barrier intrinsic; called between the vector loads in @mac_v2f16.
678 declare void @llvm.amdgcn.s.barrier() #2
; #0: kernels compiled with "no-signed-zeros-fp-math"="false".
680 attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
; #1: kernels compiled with "no-signed-zeros-fp-math"="true" (the
; *_nsz_fp_math kernels).
681 attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
; #2: attributes of the barrier declaration and its call sites.
682 attributes #2 = { nounwind convergent }