[clang][modules] Don't prevent translation of FW_Private includes when explicitly...
[llvm-project.git] / llvm / test / CodeGen / AMDGPU / fmuladd.f16.ll
blob4a2f4c4437203b92ea3aab3e39eac12e52267ffe
1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-FLUSH %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-FLUSH %s
5 ; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-DENORM %s
6 ; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-DENORM,VI-DENORM-CONTRACT %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-FLUSH %s
9 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-FLUSH %s
10 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-STRICT %s
11 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-CONTRACT %s
13 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH %s
14 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH %s
15 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM,GFX11-DENORM-STRICT %s
16 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM,GFX11-DENORM-CONTRACT %s
; Intrinsic declarations for the kernels in this file.
; @llvm.amdgcn.workitem.id.x and @llvm.fmuladd.f16 are called by the kernels
; visible in this excerpt; @llvm.fabs.f16 has no caller in the visible part -
; presumably later tests use it (TODO confirm).
18 declare i32 @llvm.amdgcn.workitem.id.x() #1
19 declare half @llvm.fmuladd.f16(half, half, half) #1
20 declare half @llvm.fabs.f16(half) #1
; fmuladd_f16 - loads one half from each of %in1, %in2 and %in3, combines them
; with a single @llvm.fmuladd.f16 call and stores the result to %out.
;
; Arithmetic instruction expected by each check prefix (see assertions below):
;   VI-FLUSH                       v_mac_f16_e32
;   VI-DENORM                      v_fma_f16
;   GFX10-FLUSH  and GFX11-FLUSH   v_mul_f16_e32 followed by v_add_f16_e32
;   GFX10-DENORM and GFX11-DENORM  v_fmac_f16_e32
;
; The assertions sit between the first line of the signature and its
; continuation. They are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
22 define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
23 ; VI-FLUSH-LABEL: fmuladd_f16:
24 ; VI-FLUSH:       ; %bb.0:
25 ; VI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
26 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
27 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
28 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
29 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v2, s4
30 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v3, s5
31 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v4, s6
32 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v5, s7
33 ; VI-FLUSH-NEXT:    flat_load_ushort v6, v[0:1]
34 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3]
35 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5]
36 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
37 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
38 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
39 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v6, v2
40 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
41 ; VI-FLUSH-NEXT:    s_endpgm
43 ; VI-DENORM-LABEL: fmuladd_f16:
44 ; VI-DENORM:       ; %bb.0:
45 ; VI-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
46 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
47 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s2
48 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s3
49 ; VI-DENORM-NEXT:    v_mov_b32_e32 v2, s4
50 ; VI-DENORM-NEXT:    v_mov_b32_e32 v3, s5
51 ; VI-DENORM-NEXT:    v_mov_b32_e32 v4, s6
52 ; VI-DENORM-NEXT:    v_mov_b32_e32 v5, s7
53 ; VI-DENORM-NEXT:    flat_load_ushort v6, v[0:1]
54 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3]
55 ; VI-DENORM-NEXT:    flat_load_ushort v3, v[4:5]
56 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s0
57 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
58 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
59 ; VI-DENORM-NEXT:    v_fma_f16 v2, v6, v2, v3
60 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
61 ; VI-DENORM-NEXT:    s_endpgm
63 ; GFX10-FLUSH-LABEL: fmuladd_f16:
64 ; GFX10-FLUSH:       ; %bb.0:
65 ; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
66 ; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
67 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
68 ; GFX10-FLUSH-NEXT:    s_clause 0x2
69 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3]
70 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[4:5]
71 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7]
72 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
73 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
74 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
75 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
76 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
77 ; GFX10-FLUSH-NEXT:    s_endpgm
79 ; GFX10-DENORM-LABEL: fmuladd_f16:
80 ; GFX10-DENORM:       ; %bb.0:
81 ; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
82 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, 0
83 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
84 ; GFX10-DENORM-NEXT:    s_clause 0x2
85 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[2:3]
86 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[4:5]
87 ; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[6:7]
88 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
89 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
90 ; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[0:1]
91 ; GFX10-DENORM-NEXT:    s_endpgm
93 ; GFX11-FLUSH-LABEL: fmuladd_f16:
94 ; GFX11-FLUSH:       ; %bb.0:
95 ; GFX11-FLUSH-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
96 ; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
97 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
98 ; GFX11-FLUSH-NEXT:    s_clause 0x2
99 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3]
100 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[4:5]
101 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7]
102 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(1)
103 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
104 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
105 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
106 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
107 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
108 ; GFX11-FLUSH-NEXT:    s_nop 0
109 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
110 ; GFX11-FLUSH-NEXT:    s_endpgm
112 ; GFX11-DENORM-LABEL: fmuladd_f16:
113 ; GFX11-DENORM:       ; %bb.0:
114 ; GFX11-DENORM-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
115 ; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, 0
116 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
117 ; GFX11-DENORM-NEXT:    s_clause 0x2
118 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[2:3]
119 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[4:5]
120 ; GFX11-DENORM-NEXT:    global_load_u16 v3, v0, s[6:7]
121 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
122 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
123 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v3, s[0:1]
124 ; GFX11-DENORM-NEXT:    s_nop 0
125 ; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
126 ; GFX11-DENORM-NEXT:    s_endpgm
127                          ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
; Three plain (non-volatile) loads feed the intrinsic: %r0 * %r1 + %r2.
128   %r0 = load half, ptr addrspace(1) %in1
129   %r1 = load half, ptr addrspace(1) %in2
130   %r2 = load half, ptr addrspace(1) %in3
131   %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
132   store half %r3, ptr addrspace(1) %out
133   ret void
; fmul_fadd_f16 - loads one half from each of %in1, %in2 and %in3, computes
; (%r0 * %r1) + %r2 with a separate fmul and fadd that carry no fast-math
; flags, and stores the result to %out.
;
; Arithmetic instruction expected by each check prefix (see assertions below):
;   VI-FLUSH                                         v_mac_f16_e32
;   VI-DENORM-CONTRACT                               v_fma_f16
;   GFX10-FLUSH  and GFX11-FLUSH                     v_mul_f16_e32 + v_add_f16_e32
;   GFX10-DENORM-STRICT   and GFX11-DENORM-STRICT    v_mul_f16_e32 + v_add_f16_e32
;   GFX10-DENORM-CONTRACT and GFX11-DENORM-CONTRACT  v_fmac_f16_e32
;
; NOTE(review): no assertion in this function uses the plain VI-DENORM prefix,
; which is the only prefix passed by the fiji -denormal-fp-math=ieee
; -fp-contract=on run line, so that configuration is not checked here. Giving
; that run line its own strict prefix and regenerating the assertions would
; presumably close the gap - confirm before changing.
;
; The assertions are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
136 define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
137 ; VI-FLUSH-LABEL: fmul_fadd_f16:
138 ; VI-FLUSH:       ; %bb.0:
139 ; VI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
140 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
141 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
142 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
143 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v2, s4
144 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v3, s5
145 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v4, s6
146 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v5, s7
147 ; VI-FLUSH-NEXT:    flat_load_ushort v6, v[0:1]
148 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3]
149 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5]
150 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
151 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
152 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
153 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v6, v2
154 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
155 ; VI-FLUSH-NEXT:    s_endpgm
157 ; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
158 ; VI-DENORM-CONTRACT:       ; %bb.0:
159 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
160 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
161 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, s2
162 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
163 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v2, s4
164 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v3, s5
165 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v4, s6
166 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v5, s7
167 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v6, v[0:1]
168 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3]
169 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5]
170 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, s0
171 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
172 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
173 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v6, v2, v3
174 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
175 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
177 ; GFX10-FLUSH-LABEL: fmul_fadd_f16:
178 ; GFX10-FLUSH:       ; %bb.0:
179 ; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
180 ; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
181 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
182 ; GFX10-FLUSH-NEXT:    s_clause 0x2
183 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3]
184 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[4:5]
185 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7]
186 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
187 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
188 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
189 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
190 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
191 ; GFX10-FLUSH-NEXT:    s_endpgm
193 ; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16:
194 ; GFX10-DENORM-STRICT:       ; %bb.0:
195 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
196 ; GFX10-DENORM-STRICT-NEXT:    v_mov_b32_e32 v0, 0
197 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
198 ; GFX10-DENORM-STRICT-NEXT:    s_clause 0x2
199 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3]
200 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[4:5]
201 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7]
202 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(1)
203 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
204 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
205 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v3
206 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
207 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
209 ; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
210 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
211 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
212 ; GFX10-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, 0
213 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
214 ; GFX10-DENORM-CONTRACT-NEXT:    s_clause 0x2
215 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3]
216 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[4:5]
217 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7]
218 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
219 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
220 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[0:1]
221 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
223 ; GFX11-FLUSH-LABEL: fmul_fadd_f16:
224 ; GFX11-FLUSH:       ; %bb.0:
225 ; GFX11-FLUSH-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
226 ; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
227 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
228 ; GFX11-FLUSH-NEXT:    s_clause 0x2
229 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3]
230 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[4:5]
231 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7]
232 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(1)
233 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
234 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
235 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
236 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
237 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
238 ; GFX11-FLUSH-NEXT:    s_nop 0
239 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
240 ; GFX11-FLUSH-NEXT:    s_endpgm
242 ; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16:
243 ; GFX11-DENORM-STRICT:       ; %bb.0:
244 ; GFX11-DENORM-STRICT-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
245 ; GFX11-DENORM-STRICT-NEXT:    v_mov_b32_e32 v0, 0
246 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
247 ; GFX11-DENORM-STRICT-NEXT:    s_clause 0x2
248 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3]
249 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[4:5]
250 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7]
251 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(1)
252 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
253 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
254 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
255 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v3
256 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
257 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
258 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
259 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
261 ; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
262 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
263 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
264 ; GFX11-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, 0
265 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
266 ; GFX11-DENORM-CONTRACT-NEXT:    s_clause 0x2
267 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3]
268 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[4:5]
269 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7]
270 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
271 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
272 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v3, s[0:1]
273 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
274 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
275 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
276                          ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
277   %r0 = load half, ptr addrspace(1) %in1
278   %r1 = load half, ptr addrspace(1) %in2
279   %r2 = load half, ptr addrspace(1) %in3
; Neither instruction has a fast-math flag, so any fusing expected above comes
; from the llc options on the run lines, not from the IR itself.
280   %mul = fmul half %r0, %r1
281   %add = fadd half %mul, %r2
282   store half %add, ptr addrspace(1) %out
283   ret void
; fmul_fadd_contract_f16 - loads one half from each of %in1, %in2 and %in3,
; computes (%r0 * %r1) + %r2 with an fmul and an fadd that both carry the
; `contract` fast-math flag, and stores the result to %out.
;
; Arithmetic instruction expected by each check prefix (see assertions below):
;   VI-FLUSH                       v_mac_f16_e32
;   VI-DENORM                      v_fma_f16
;   GFX10-FLUSH  and GFX11-FLUSH   v_mul_f16_e32 followed by v_add_f16_e32
;   GFX10-DENORM and GFX11-DENORM  v_fmac_f16_e32
; Only the shared per-target prefixes appear, so the -fp-contract=on and
; -fp-contract=fast runs of each target are held to the same output.
;
; The assertions are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
286 define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
287 ; VI-FLUSH-LABEL: fmul_fadd_contract_f16:
288 ; VI-FLUSH:       ; %bb.0:
289 ; VI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
290 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
291 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
292 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
293 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v2, s4
294 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v3, s5
295 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v4, s6
296 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v5, s7
297 ; VI-FLUSH-NEXT:    flat_load_ushort v6, v[0:1]
298 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3]
299 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5]
300 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
301 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
302 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
303 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v6, v2
304 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
305 ; VI-FLUSH-NEXT:    s_endpgm
307 ; VI-DENORM-LABEL: fmul_fadd_contract_f16:
308 ; VI-DENORM:       ; %bb.0:
309 ; VI-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
310 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
311 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s2
312 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s3
313 ; VI-DENORM-NEXT:    v_mov_b32_e32 v2, s4
314 ; VI-DENORM-NEXT:    v_mov_b32_e32 v3, s5
315 ; VI-DENORM-NEXT:    v_mov_b32_e32 v4, s6
316 ; VI-DENORM-NEXT:    v_mov_b32_e32 v5, s7
317 ; VI-DENORM-NEXT:    flat_load_ushort v6, v[0:1]
318 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3]
319 ; VI-DENORM-NEXT:    flat_load_ushort v3, v[4:5]
320 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s0
321 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
322 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
323 ; VI-DENORM-NEXT:    v_fma_f16 v2, v6, v2, v3
324 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
325 ; VI-DENORM-NEXT:    s_endpgm
327 ; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16:
328 ; GFX10-FLUSH:       ; %bb.0:
329 ; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
330 ; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
331 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
332 ; GFX10-FLUSH-NEXT:    s_clause 0x2
333 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3]
334 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[4:5]
335 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7]
336 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
337 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
338 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
339 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
340 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
341 ; GFX10-FLUSH-NEXT:    s_endpgm
343 ; GFX10-DENORM-LABEL: fmul_fadd_contract_f16:
344 ; GFX10-DENORM:       ; %bb.0:
345 ; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
346 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, 0
347 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
348 ; GFX10-DENORM-NEXT:    s_clause 0x2
349 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[2:3]
350 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[4:5]
351 ; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[6:7]
352 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
353 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
354 ; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[0:1]
355 ; GFX10-DENORM-NEXT:    s_endpgm
357 ; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16:
358 ; GFX11-FLUSH:       ; %bb.0:
359 ; GFX11-FLUSH-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
360 ; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
361 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
362 ; GFX11-FLUSH-NEXT:    s_clause 0x2
363 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3]
364 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[4:5]
365 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7]
366 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(1)
367 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
368 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
369 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
370 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
371 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
372 ; GFX11-FLUSH-NEXT:    s_nop 0
373 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
374 ; GFX11-FLUSH-NEXT:    s_endpgm
376 ; GFX11-DENORM-LABEL: fmul_fadd_contract_f16:
377 ; GFX11-DENORM:       ; %bb.0:
378 ; GFX11-DENORM-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
379 ; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, 0
380 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
381 ; GFX11-DENORM-NEXT:    s_clause 0x2
382 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[2:3]
383 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[4:5]
384 ; GFX11-DENORM-NEXT:    global_load_u16 v3, v0, s[6:7]
385 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
386 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
387 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v3, s[0:1]
388 ; GFX11-DENORM-NEXT:    s_nop 0
389 ; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
390 ; GFX11-DENORM-NEXT:    s_endpgm
391                          ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
392   %r0 = load half, ptr addrspace(1) %in1
393   %r1 = load half, ptr addrspace(1) %in2
394   %r2 = load half, ptr addrspace(1) %in3
; Both the multiply and the add carry `contract`.
395   %mul = fmul contract half %r0, %r1
396   %add = fadd contract half %mul, %r2
397   store half %add, ptr addrspace(1) %out
398   ret void
; fmuladd_2.0_a_b_f16 - @llvm.fmuladd.f16 with the constant 2.0 as the FIRST
; multiplicand. Indexed by the work-item id: volatile-loads the halves at
; %out[tid] and %out[tid + 1], computes fmuladd(2.0, %r1, %r2) and stores the
; result back to %out[tid]. The %in parameter is never referenced in the body.
;
; Arithmetic instruction expected by each check prefix (see assertions below):
;   VI-FLUSH                       v_mac_f16_e32 with 2.0 as an inline operand
;   VI-DENORM                      v_fma_f16 with 2.0 as an inline operand
;   GFX10-FLUSH  and GFX11-FLUSH   two v_add_f16_e32 (a + a, then + b); no multiply
;   GFX10-DENORM and GFX11-DENORM  v_fmac_f16_e32 with 2.0 as an inline operand
;
; The assertions are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
401 define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
402 ; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
403 ; VI-FLUSH:       ; %bb.0:
404 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
405 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
406 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
407 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
408 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
409 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
410 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
411 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
412 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
413 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
414 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
415 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
416 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
417 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
418 ; VI-FLUSH-NEXT:    s_endpgm
420 ; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16:
421 ; VI-DENORM:       ; %bb.0:
422 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
423 ; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
424 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
425 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
426 ; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
427 ; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
428 ; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
429 ; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
430 ; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
431 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
432 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
433 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
434 ; VI-DENORM-NEXT:    v_fma_f16 v2, v4, 2.0, v2
435 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
436 ; VI-DENORM-NEXT:    s_endpgm
438 ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
439 ; GFX10-FLUSH:       ; %bb.0:
440 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
441 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
442 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
443 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
444 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
445 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
446 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
447 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
448 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
449 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
450 ; GFX10-FLUSH-NEXT:    s_endpgm
452 ; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16:
453 ; GFX10-DENORM:       ; %bb.0:
454 ; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
455 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
456 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
457 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
458 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
459 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
460 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
461 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
462 ; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
463 ; GFX10-DENORM-NEXT:    s_endpgm
465 ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
466 ; GFX11-FLUSH:       ; %bb.0:
467 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
468 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
469 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
470 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
471 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
472 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
473 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
474 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
475 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
476 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
477 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
478 ; GFX11-FLUSH-NEXT:    s_nop 0
479 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
480 ; GFX11-FLUSH-NEXT:    s_endpgm
482 ; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16:
483 ; GFX11-DENORM:       ; %bb.0:
484 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
485 ; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
486 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
487 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
488 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
489 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
490 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
491 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
492 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
493 ; GFX11-DENORM-NEXT:    s_nop 0
494 ; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
495 ; GFX11-DENORM-NEXT:    s_endpgm
496   %tid = call i32 @llvm.amdgcn.workitem.id.x()
; %gep.0 and %gep.out are built by identical getelementptrs (%out + %tid);
; %gep.1 is the half element immediately after %gep.0.
497   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
498   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
499   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
; Volatile loads: the expected output issues them one at a time, each followed
; by its own s_waitcnt vmcnt(0).
501   %r1 = load volatile half, ptr addrspace(1) %gep.0
502   %r2 = load volatile half, ptr addrspace(1) %gep.1
504   %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
505   store half %r3, ptr addrspace(1) %gep.out
506   ret void
; fmuladd_a_2.0_b_f16 - @llvm.fmuladd.f16 with the constant 2.0 as the SECOND
; multiplicand. Indexed by the work-item id: volatile-loads the halves at
; %out[tid] and %out[tid + 1], computes fmuladd(%r1, 2.0, %r2) and stores the
; result back to %out[tid]. The %in parameter is never referenced in the body.
;
; Arithmetic instruction expected by each check prefix (see assertions below):
;   VI-FLUSH                       v_mac_f16_e32 with 2.0 as an inline operand
;   VI-DENORM                      v_fma_f16 with 2.0 as an inline operand
;   GFX10-FLUSH  and GFX11-FLUSH   two v_add_f16_e32 (a + a, then + b); no multiply
;   GFX10-DENORM and GFX11-DENORM  v_fmac_f16_e32 with 2.0 as an inline operand
;
; The assertions are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
509 define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
510 ; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
511 ; VI-FLUSH:       ; %bb.0:
512 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
513 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
514 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
515 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
516 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
517 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
518 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
519 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
520 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
521 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
522 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
523 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
524 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
525 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
526 ; VI-FLUSH-NEXT:    s_endpgm
528 ; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16:
529 ; VI-DENORM:       ; %bb.0:
530 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
531 ; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
532 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
533 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
534 ; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
535 ; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
536 ; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
537 ; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
538 ; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
539 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
540 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
541 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
542 ; VI-DENORM-NEXT:    v_fma_f16 v2, v4, 2.0, v2
543 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
544 ; VI-DENORM-NEXT:    s_endpgm
546 ; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
547 ; GFX10-FLUSH:       ; %bb.0:
548 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
549 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
550 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
551 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
552 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
553 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
554 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
555 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
556 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
557 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
558 ; GFX10-FLUSH-NEXT:    s_endpgm
560 ; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16:
561 ; GFX10-DENORM:       ; %bb.0:
562 ; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
563 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
564 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
565 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
566 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
567 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
568 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
569 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
570 ; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
571 ; GFX10-DENORM-NEXT:    s_endpgm
573 ; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
574 ; GFX11-FLUSH:       ; %bb.0:
575 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
576 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
577 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
578 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
579 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
580 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
581 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
582 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
583 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
584 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
585 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
586 ; GFX11-FLUSH-NEXT:    s_nop 0
587 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
588 ; GFX11-FLUSH-NEXT:    s_endpgm
590 ; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16:
591 ; GFX11-DENORM:       ; %bb.0:
592 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
593 ; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
594 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
595 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
596 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
597 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
598 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
599 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
600 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
601 ; GFX11-DENORM-NEXT:    s_nop 0
602 ; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
603 ; GFX11-DENORM-NEXT:    s_endpgm
604   %tid = call i32 @llvm.amdgcn.workitem.id.x()
; %gep.0 and %gep.out are built by identical getelementptrs (%out + %tid);
; %gep.1 is the half element immediately after %gep.0.
605   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
606   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
607   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
; Volatile loads: the expected output issues them one at a time, each followed
; by its own s_waitcnt vmcnt(0).
609   %r1 = load volatile half, ptr addrspace(1) %gep.0
610   %r2 = load volatile half, ptr addrspace(1) %gep.1
612   %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
613   store half %r3, ptr addrspace(1) %gep.out
614   ret void
; Test kernel fadd_a_a_b_f16 - computes (a + a) + b with two plain fadd
; instructions (no fmuladd intrinsic call). a is loaded from %out[tid] and b
; from %out[tid + 1], both with volatile loads, and the sum is stored back to
; %out[tid]. The %in1 and %in2 parameters are not referenced by the body.
;
; What the assertions below expect for the arithmetic:
;   - VI-FLUSH prefix           -> a single v_mac_f16 with the constant 2.0
;   - VI-DENORM-CONTRACT prefix -> a single v_fma_f16 with the constant 2.0
;   - GFX10/GFX11 FLUSH and DENORM-STRICT prefixes -> two v_add_f16
;     (a + a, then + b), i.e. no fused instruction
;   - GFX10/GFX11 DENORM-CONTRACT prefixes -> a single v_fmac_f16
;
; NOTE(review): there is no VI-DENORM-STRICT block here. The fiji/ieee
; -fp-contract=on RUN line uses only the VI-DENORM prefix, so that
; configuration appears to have no assertions for this function - confirm
; whether a STRICT prefix should be added to that RUN line.
617 define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
618 ; VI-FLUSH-LABEL: fadd_a_a_b_f16:
619 ; VI-FLUSH:       ; %bb.0:
620 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
621 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
622 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
623 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
624 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
625 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
626 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
627 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
628 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
629 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
630 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
631 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
632 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
633 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
634 ; VI-FLUSH-NEXT:    s_endpgm
636 ; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
637 ; VI-DENORM-CONTRACT:       ; %bb.0:
638 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
639 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
640 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
641 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
642 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
643 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
644 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
645 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
646 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v4, v[0:1] glc
647 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
648 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
649 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
650 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v4, 2.0, v2
651 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
652 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
654 ; GFX10-FLUSH-LABEL: fadd_a_a_b_f16:
655 ; GFX10-FLUSH:       ; %bb.0:
656 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
657 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
658 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
659 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
660 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
661 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
662 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
663 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
664 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
665 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
666 ; GFX10-FLUSH-NEXT:    s_endpgm
668 ; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
669 ; GFX10-DENORM-STRICT:       ; %bb.0:
670 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
671 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
672 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
673 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
674 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
675 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
676 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
677 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
678 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v2
679 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
680 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
682 ; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
683 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
684 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
685 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
686 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
687 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
688 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
689 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
690 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
691 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
692 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v2, s[0:1]
693 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
695 ; GFX11-FLUSH-LABEL: fadd_a_a_b_f16:
696 ; GFX11-FLUSH:       ; %bb.0:
697 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
698 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
699 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
700 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
701 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
702 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
703 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
704 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
705 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
706 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
707 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
708 ; GFX11-FLUSH-NEXT:    s_nop 0
709 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
710 ; GFX11-FLUSH-NEXT:    s_endpgm
712 ; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
713 ; GFX11-DENORM-STRICT:       ; %bb.0:
714 ; GFX11-DENORM-STRICT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
715 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
716 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
717 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
718 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
719 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
720 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
721 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
722 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
723 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v2
724 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
725 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
726 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
727 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
729 ; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
730 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
731 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
732 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
733 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
734 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
735 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
736 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
737 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
738 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
739 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v2, s[0:1]
740 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
741 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
742 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
743                             ptr addrspace(1) %in1,
744                             ptr addrspace(1) %in2) #0 {
; Addressing: %gep.0 and %gep.out both point at %out[tid]; %gep.1 is the next
; half element after %gep.0.
745   %tid = call i32 @llvm.amdgcn.workitem.id.x()
746   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
747   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
748   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
; %r0 is a, %r1 is b.
750   %r0 = load volatile half, ptr addrspace(1) %gep.0
751   %r1 = load volatile half, ptr addrspace(1) %gep.1
; %add.0 = a + a; %add.1 = (a + a) + b, with the doubled value as the left
; operand of the second fadd.
753   %add.0 = fadd half %r0, %r0
754   %add.1 = fadd half %add.0, %r1
755   store half %add.1, ptr addrspace(1) %gep.out
756   ret void
; Test kernel fadd_b_a_a_f16 - computes b + (a + a) with two plain fadd
; instructions (no fmuladd intrinsic call); the doubled value is the right-hand
; operand of the second fadd. a is loaded from %out[tid] and b from
; %out[tid + 1], both with volatile loads, and the sum is stored back to
; %out[tid]. The %in1 and %in2 parameters are not referenced by the body.
;
; What the assertions below expect for the arithmetic:
;   - VI-FLUSH prefix           -> a single v_mac_f16 with the constant 2.0
;   - VI-DENORM-CONTRACT prefix -> a single v_fma_f16 with the constant 2.0
;   - GFX10/GFX11 FLUSH and DENORM-STRICT prefixes -> two v_add_f16, the second
;     one taking the loaded b (v2) as its first source operand
;   - GFX10/GFX11 DENORM-CONTRACT prefixes -> a single v_fmac_f16
;
; NOTE(review): there is no VI-DENORM-STRICT block here. The fiji/ieee
; -fp-contract=on RUN line uses only the VI-DENORM prefix, so that
; configuration appears to have no assertions for this function - confirm
; whether a STRICT prefix should be added to that RUN line.
759 define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
760 ; VI-FLUSH-LABEL: fadd_b_a_a_f16:
761 ; VI-FLUSH:       ; %bb.0:
762 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
763 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
764 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
765 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
766 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
767 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
768 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
769 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
770 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
771 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
772 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
773 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
774 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
775 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
776 ; VI-FLUSH-NEXT:    s_endpgm
778 ; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
779 ; VI-DENORM-CONTRACT:       ; %bb.0:
780 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
781 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
782 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
783 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
784 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
785 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
786 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
787 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
788 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v4, v[0:1] glc
789 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
790 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
791 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
792 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v4, 2.0, v2
793 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
794 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
796 ; GFX10-FLUSH-LABEL: fadd_b_a_a_f16:
797 ; GFX10-FLUSH:       ; %bb.0:
798 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
799 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
800 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
801 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
802 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
803 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
804 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
805 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
806 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
807 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
808 ; GFX10-FLUSH-NEXT:    s_endpgm
810 ; GFX10-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
811 ; GFX10-DENORM-STRICT:       ; %bb.0:
812 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
813 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
814 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
815 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
816 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
817 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
818 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
819 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
820 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v2, v1
821 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
822 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
824 ; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
825 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
826 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
827 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
828 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
829 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
830 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
831 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
832 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
833 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
834 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v2, s[0:1]
835 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
837 ; GFX11-FLUSH-LABEL: fadd_b_a_a_f16:
838 ; GFX11-FLUSH:       ; %bb.0:
839 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
840 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
841 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
842 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
843 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
844 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
845 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
846 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
847 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
848 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
849 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
850 ; GFX11-FLUSH-NEXT:    s_nop 0
851 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
852 ; GFX11-FLUSH-NEXT:    s_endpgm
854 ; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
855 ; GFX11-DENORM-STRICT:       ; %bb.0:
856 ; GFX11-DENORM-STRICT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
857 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
858 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
859 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
860 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
861 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
862 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
863 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
864 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
865 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v2, v1
866 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
867 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
868 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
869 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
871 ; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
872 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
873 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
874 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
875 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
876 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
877 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
878 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
879 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
880 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
881 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v2, s[0:1]
882 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
883 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
884 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
885                             ptr addrspace(1) %in1,
886                             ptr addrspace(1) %in2) #0 {
; Addressing: %gep.0 and %gep.out both point at %out[tid]; %gep.1 is the next
; half element after %gep.0.
887   %tid = call i32 @llvm.amdgcn.workitem.id.x()
888   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
889   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
890   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
; %r0 is a, %r1 is b.
892   %r0 = load volatile half, ptr addrspace(1) %gep.0
893   %r1 = load volatile half, ptr addrspace(1) %gep.1
; %add.0 = a + a; %add.1 = b + (a + a), with the doubled value as the right
; operand of the second fadd.
895   %add.0 = fadd half %r0, %r0
896   %add.1 = fadd half %r1, %add.0
897   store half %add.1, ptr addrspace(1) %gep.out
898   ret void
; Test kernel fmuladd_neg_2.0_a_b_f16 - calls llvm.fmuladd.f16(-2.0, a, b).
; a is loaded from %out[tid] and b from %out[tid + 1], both with volatile
; loads, and the result is stored back to %out[tid]. The %in parameter is not
; referenced by the body.
;
; What the assertions below expect for the arithmetic:
;   - VI-FLUSH prefix   -> a single v_mac_f16 with the constant -2.0
;   - VI-DENORM prefix  -> a single v_fma_f16 with the constant -2.0
;   - GFX10/GFX11 FLUSH prefixes  -> v_add_f16 (a + a) followed by v_sub_f16
;     subtracting that sum from b, i.e. no fused instruction
;   - GFX10/GFX11 DENORM prefixes -> a single v_fmac_f16 with the constant -2.0
901 define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
902 ; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
903 ; VI-FLUSH:       ; %bb.0:
904 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
905 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
906 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
907 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
908 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
909 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
910 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
911 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
912 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
913 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
914 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
915 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
916 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, -2.0, v4
917 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
918 ; VI-FLUSH-NEXT:    s_endpgm
920 ; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
921 ; VI-DENORM:       ; %bb.0:
922 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
923 ; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
924 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
925 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
926 ; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
927 ; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
928 ; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
929 ; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
930 ; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
931 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
932 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
933 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
934 ; VI-DENORM-NEXT:    v_fma_f16 v2, v4, -2.0, v2
935 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
936 ; VI-DENORM-NEXT:    s_endpgm
938 ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
939 ; GFX10-FLUSH:       ; %bb.0:
940 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
941 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
942 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
943 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
944 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
945 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
946 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
947 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
948 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
949 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
950 ; GFX10-FLUSH-NEXT:    s_endpgm
952 ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
953 ; GFX10-DENORM:       ; %bb.0:
954 ; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
955 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
956 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
957 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
958 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
959 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
960 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
961 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
962 ; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
963 ; GFX10-DENORM-NEXT:    s_endpgm
965 ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
966 ; GFX11-FLUSH:       ; %bb.0:
967 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
968 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
969 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
970 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
971 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
972 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
973 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
974 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
975 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
976 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
977 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
978 ; GFX11-FLUSH-NEXT:    s_nop 0
979 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
980 ; GFX11-FLUSH-NEXT:    s_endpgm
982 ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
983 ; GFX11-DENORM:       ; %bb.0:
984 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
985 ; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
986 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
987 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
988 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
989 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
990 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
991 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
992 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
993 ; GFX11-DENORM-NEXT:    s_nop 0
994 ; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
995 ; GFX11-DENORM-NEXT:    s_endpgm
; Addressing: %gep.0 and %gep.out both point at %out[tid]; %gep.1 is the next
; half element after %gep.0.
996   %tid = call i32 @llvm.amdgcn.workitem.id.x()
997   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
998   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
999   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
; %r1 is a (the multiplicand), %r2 is b (the addend).
1001   %r1 = load volatile half, ptr addrspace(1) %gep.0
1002   %r2 = load volatile half, ptr addrspace(1) %gep.1
; The constant -2.0 is passed as the first multiplication operand.
1004   %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
1005   store half %r3, ptr addrspace(1) %gep.out
1006   ret void
; Test kernel fmuladd_neg_2.0_neg_a_b_f16 - calls
; llvm.fmuladd.f16(-2.0, fneg(a), b). a is loaded from %out[tid] and b from
; %out[tid + 1], both with volatile loads, and the result is stored back to
; %out[tid]. The %in parameter is not referenced by the body.
;
; Every assertion block below expects the two negations to have cancelled: no
; negated constant or source modifier appears in the arithmetic.
;   - VI-FLUSH prefix   -> a single v_mac_f16 with the constant 2.0
;   - VI-DENORM prefix  -> a single v_fma_f16 with the constant 2.0
;   - GFX10/GFX11 FLUSH prefixes  -> v_add_f16 (a + a) followed by a second
;     v_add_f16 adding b, i.e. no fused instruction
;   - GFX10/GFX11 DENORM prefixes -> a single v_fmac_f16 with the constant 2.0
1009 define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1010 ; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1011 ; VI-FLUSH:       ; %bb.0:
1012 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1013 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1014 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1015 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1016 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1017 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1018 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1019 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1020 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
1021 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1022 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1023 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1024 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
1025 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1026 ; VI-FLUSH-NEXT:    s_endpgm
1028 ; VI-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1029 ; VI-DENORM:       ; %bb.0:
1030 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1031 ; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1032 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1033 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
1034 ; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1035 ; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1036 ; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1037 ; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1038 ; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
1039 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
1040 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
1041 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
1042 ; VI-DENORM-NEXT:    v_fma_f16 v2, v4, 2.0, v2
1043 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
1044 ; VI-DENORM-NEXT:    s_endpgm
1046 ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1047 ; GFX10-FLUSH:       ; %bb.0:
1048 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1049 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1050 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1051 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
1052 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1053 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1054 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1055 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
1056 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
1057 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1058 ; GFX10-FLUSH-NEXT:    s_endpgm
1060 ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1061 ; GFX10-DENORM:       ; %bb.0:
1062 ; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1063 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1064 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1065 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
1066 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
1067 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1068 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
1069 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
1070 ; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
1071 ; GFX10-DENORM-NEXT:    s_endpgm
1073 ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1074 ; GFX11-FLUSH:       ; %bb.0:
1075 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1076 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1077 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1078 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
1079 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1080 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1081 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1082 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
1083 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1084 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
1085 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1086 ; GFX11-FLUSH-NEXT:    s_nop 0
1087 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1088 ; GFX11-FLUSH-NEXT:    s_endpgm
1090 ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1091 ; GFX11-DENORM:       ; %bb.0:
1092 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1093 ; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1094 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1095 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
1096 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1097 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1098 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1099 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
1100 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
1101 ; GFX11-DENORM-NEXT:    s_nop 0
1102 ; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1103 ; GFX11-DENORM-NEXT:    s_endpgm
; Addressing: %gep.0 and %gep.out both point at %out[tid]; %gep.1 is the next
; half element after %gep.0.
1104   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1105   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
1106   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
1107   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
; %r1 is a (the multiplicand), %r2 is b (the addend).
1109   %r1 = load volatile half, ptr addrspace(1) %gep.0
1110   %r2 = load volatile half, ptr addrspace(1) %gep.1
; a is negated explicitly before being multiplied by the negative constant.
1112   %r1.fneg = fneg half %r1
1114   %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
1115   store half %r3, ptr addrspace(1) %gep.out
1116   ret void
; Test kernel fmuladd_2.0_neg_a_b_f16 - calls
; llvm.fmuladd.f16(2.0, fneg(a), b). a is loaded from %out[tid] and b from
; %out[tid + 1], both with volatile loads, and the result is stored back to
; %out[tid]. The %in parameter is not referenced by the body.
;
; Every assertion block below expects the fneg to be absorbed into the
; arithmetic rather than emitted as a separate instruction:
;   - VI-FLUSH prefix   -> a single v_mac_f16 with the constant -2.0
;   - VI-DENORM prefix  -> a single v_fma_f16 with the constant -2.0
;   - GFX10/GFX11 FLUSH prefixes  -> v_add_f16 (a + a) followed by v_sub_f16
;     subtracting that sum from b, i.e. no fused instruction
;   - GFX10/GFX11 DENORM prefixes -> a single v_fmac_f16 with the constant -2.0
1119 define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1120 ; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
1121 ; VI-FLUSH:       ; %bb.0:
1122 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1123 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1124 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1125 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1126 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1127 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1128 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1129 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1130 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
1131 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1132 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1133 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1134 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, -2.0, v4
1135 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1136 ; VI-FLUSH-NEXT:    s_endpgm
1138 ; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
1139 ; VI-DENORM:       ; %bb.0:
1140 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1141 ; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1142 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1143 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
1144 ; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1145 ; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1146 ; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1147 ; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1148 ; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
1149 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
1150 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
1151 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
1152 ; VI-DENORM-NEXT:    v_fma_f16 v2, v4, -2.0, v2
1153 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
1154 ; VI-DENORM-NEXT:    s_endpgm
1156 ; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
1157 ; GFX10-FLUSH:       ; %bb.0:
1158 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1159 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1160 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1161 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
1162 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1163 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1164 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1165 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
1166 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
1167 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1168 ; GFX10-FLUSH-NEXT:    s_endpgm
1170 ; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
1171 ; GFX10-DENORM:       ; %bb.0:
1172 ; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1173 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1174 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1175 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
1176 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
1177 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1178 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
1179 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
1180 ; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
1181 ; GFX10-DENORM-NEXT:    s_endpgm
1183 ; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
1184 ; GFX11-FLUSH:       ; %bb.0:
1185 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1186 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1187 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1188 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
1189 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1190 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1191 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1192 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
1193 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1194 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
1195 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1196 ; GFX11-FLUSH-NEXT:    s_nop 0
1197 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1198 ; GFX11-FLUSH-NEXT:    s_endpgm
1200 ; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
1201 ; GFX11-DENORM:       ; %bb.0:
1202 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1203 ; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1204 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1205 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
1206 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1207 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1208 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1209 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
1210 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
1211 ; GFX11-DENORM-NEXT:    s_nop 0
1212 ; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1213 ; GFX11-DENORM-NEXT:    s_endpgm
; Addressing: %gep.0 and %gep.out both point at %out[tid]; %gep.1 is the next
; half element after %gep.0.
1214   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1215   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
1216   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
1217   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
; %r1 is a (the multiplicand), %r2 is b (the addend).
1219   %r1 = load volatile half, ptr addrspace(1) %gep.0
1220   %r2 = load volatile half, ptr addrspace(1) %gep.1
; Only a is negated here; the constant operand stays positive.
1222   %r1.fneg = fneg half %r1
1224   %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
1225   store half %r3, ptr addrspace(1) %gep.out
1226   ret void
1229 define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1230 ; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
1231 ; VI-FLUSH:       ; %bb.0:
1232 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1233 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1234 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1235 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1236 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1237 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1238 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1239 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1240 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
1241 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1242 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1243 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1244 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v4, 2.0, -v2
1245 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1246 ; VI-FLUSH-NEXT:    s_endpgm
1248 ; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
1249 ; VI-DENORM:       ; %bb.0:
1250 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1251 ; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1252 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1253 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
1254 ; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1255 ; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1256 ; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1257 ; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1258 ; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
1259 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
1260 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
1261 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
1262 ; VI-DENORM-NEXT:    v_fma_f16 v2, v4, 2.0, -v2
1263 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
1264 ; VI-DENORM-NEXT:    s_endpgm
1266 ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
1267 ; GFX10-FLUSH:       ; %bb.0:
1268 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1269 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1270 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1271 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
1272 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1273 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1274 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1275 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
1276 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
1277 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1278 ; GFX10-FLUSH-NEXT:    s_endpgm
1280 ; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
1281 ; GFX10-DENORM:       ; %bb.0:
1282 ; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1283 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1284 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1285 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
1286 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
1287 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1288 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
1289 ; GFX10-DENORM-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
1290 ; GFX10-DENORM-NEXT:    global_store_short v0, v1, s[0:1]
1291 ; GFX10-DENORM-NEXT:    s_endpgm
1293 ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
1294 ; GFX11-FLUSH:       ; %bb.0:
1295 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1296 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1297 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1298 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
1299 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1300 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1301 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1302 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
1303 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1304 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
1305 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1306 ; GFX11-FLUSH-NEXT:    s_nop 0
1307 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1308 ; GFX11-FLUSH-NEXT:    s_endpgm
1310 ; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
1311 ; GFX11-DENORM:       ; %bb.0:
1312 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1313 ; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1314 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1315 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
1316 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1317 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1318 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1319 ; GFX11-DENORM-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
1320 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v1, s[0:1]
1321 ; GFX11-DENORM-NEXT:    s_nop 0
1322 ; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1323 ; GFX11-DENORM-NEXT:    s_endpgm
1324   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1325   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
1326   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
1327   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
1329   %r1 = load volatile half, ptr addrspace(1) %gep.0
1330   %r2 = load volatile half, ptr addrspace(1) %gep.1
1332   %r2.fneg = fneg half %r2
1334   %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
1335   store half %r3, ptr addrspace(1) %gep.out
1336   ret void
; mad_sub_f16: computes a * b - c on half values using separate fmul and fsub
; instructions. a, b and c are read from %ptr at element indices tid, tid + 1
; and tid + 2; the result is stored to %out at element index tid.
;
; Instruction selection recorded by the check blocks below:
;   - VI flush run:                         one v_mad_f16, third operand negated
;   - VI denormal contract run:             one v_fma_f16, third operand negated
;   - GFX10/GFX11 flush and strict runs:    v_mul_f16 followed by v_sub_f16
;   - GFX10/GFX11 denormal contract runs:   one v_fma_f16, third operand negated
;
; NOTE(review): unlike GFX10/GFX11, no strict denormal check block is visible
; for VI in this function - confirm that this coverage gap is intentional.
1339 define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
1340 ; VI-FLUSH-LABEL: mad_sub_f16:
1341 ; VI-FLUSH:       ; %bb.0:
1342 ; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1343 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1344 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1345 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
1346 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1347 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1348 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1349 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1350 ; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1351 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1352 ; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
1353 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1354 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1355 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1356 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
1357 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1358 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1359 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1360 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1361 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, v2, -v3
1362 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1363 ; VI-FLUSH-NEXT:    s_endpgm
1365 ; VI-DENORM-CONTRACT-LABEL: mad_sub_f16:
1366 ; VI-DENORM-CONTRACT:       ; %bb.0:
1367 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1368 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1369 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1370 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
1371 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1372 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1373 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1374 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1375 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1376 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1377 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
1378 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1379 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
1380 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1381 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
1382 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1383 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
1384 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1385 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1386 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, -v3
1387 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
1388 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
1390 ; GFX10-FLUSH-LABEL: mad_sub_f16:
1391 ; GFX10-FLUSH:       ; %bb.0:
1392 ; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1393 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1394 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1395 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1396 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1397 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1398 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1399 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1400 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1401 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1402 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
1403 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1404 ; GFX10-FLUSH-NEXT:    s_endpgm
1406 ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16:
1407 ; GFX10-DENORM-STRICT:       ; %bb.0:
1408 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1409 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1410 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1411 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1412 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1413 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1414 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1415 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1416 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1417 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1418 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
1419 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
1420 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
1422 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16:
1423 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
1424 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1425 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1426 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1427 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1428 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1429 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1430 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1431 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1432 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1433 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
1434 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
1435 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
1437 ; GFX11-FLUSH-LABEL: mad_sub_f16:
1438 ; GFX11-FLUSH:       ; %bb.0:
1439 ; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1440 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1441 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1442 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1443 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1444 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1445 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1446 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1447 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1448 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1449 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1450 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
1451 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1452 ; GFX11-FLUSH-NEXT:    s_nop 0
1453 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1454 ; GFX11-FLUSH-NEXT:    s_endpgm
1456 ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16:
1457 ; GFX11-DENORM-STRICT:       ; %bb.0:
1458 ; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1459 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1460 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1461 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1462 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1463 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1464 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1465 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1466 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1467 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1468 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1469 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
1470 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
1471 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
1472 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1473 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
1475 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16:
1476 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
1477 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1478 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1479 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1480 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1481 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1482 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1483 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1484 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1485 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1486 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
1487 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
1488 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
1489 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1490 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
; Per-work-item element index: the x work-item id, sign-extended to i64.
1491   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
1492   %tid.ext = sext i32 %tid to i64
; Addresses of the three consecutive half inputs and of the output element.
1493   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
1494   %add1 = add i64 %tid.ext, 1
1495   %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
1496   %add2 = add i64 %tid.ext, 2
1497   %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
1498   %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads; the expected output waits (s_waitcnt vmcnt(0)) after each one.
1499   %a = load volatile half, ptr addrspace(1) %gep0, align 2
1500   %b = load volatile half, ptr addrspace(1) %gep1, align 2
1501   %c = load volatile half, ptr addrspace(1) %gep2, align 2
; a * b - c as two separate instructions; the check blocks above record which
; runs fold this into a single v_mad_f16 / v_fma_f16.
1502   %mul = fmul half %a, %b
1503   %sub = fsub half %mul, %c
1504   store half %sub, ptr addrspace(1) %outgep, align 2
1505   ret void
; mad_sub_inv_f16: computes c - a * b on half values using separate fmul and
; fsub instructions (the subtraction operands are swapped relative to a plain
; a * b - c). a, b and c are read from %ptr at element indices tid, tid + 1
; and tid + 2; the result is stored to %out at element index tid.
;
; Instruction selection recorded by the check blocks below:
;   - VI flush run:                         one v_mad_f16, first operand negated
;   - VI denormal contract run:             one v_fma_f16, first operand negated
;   - GFX10/GFX11 flush and strict runs:    v_mul_f16, then v_sub_f16 with the
;                                           product as the subtrahend
;   - GFX10/GFX11 denormal contract runs:   one v_fma_f16, first operand negated
;
; NOTE(review): unlike GFX10/GFX11, no strict denormal check block is visible
; for VI in this function - confirm that this coverage gap is intentional.
1508 define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
1509 ; VI-FLUSH-LABEL: mad_sub_inv_f16:
1510 ; VI-FLUSH:       ; %bb.0:
1511 ; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1512 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1513 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1514 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
1515 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1516 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1517 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1518 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1519 ; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1520 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1521 ; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
1522 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1523 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1524 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1525 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
1526 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1527 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1528 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1529 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1530 ; VI-FLUSH-NEXT:    v_mad_f16 v2, -v7, v2, v3
1531 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1532 ; VI-FLUSH-NEXT:    s_endpgm
1534 ; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
1535 ; VI-DENORM-CONTRACT:       ; %bb.0:
1536 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1537 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1538 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1539 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
1540 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1541 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1542 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1543 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1544 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1545 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1546 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
1547 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1548 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
1549 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1550 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
1551 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1552 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
1553 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1554 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1555 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, -v7, v2, v3
1556 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
1557 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
1559 ; GFX10-FLUSH-LABEL: mad_sub_inv_f16:
1560 ; GFX10-FLUSH:       ; %bb.0:
1561 ; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1562 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1563 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1564 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1565 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1566 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1567 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1568 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1569 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1570 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1571 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
1572 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1573 ; GFX10-FLUSH-NEXT:    s_endpgm
1575 ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16:
1576 ; GFX10-DENORM-STRICT:       ; %bb.0:
1577 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1578 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1579 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1580 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1581 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1582 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1583 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1584 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1585 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1586 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1587 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
1588 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
1589 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
1591 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
1592 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
1593 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1594 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1595 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1596 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1597 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1598 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1599 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1600 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1601 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1602 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
1603 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
1604 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
1606 ; GFX11-FLUSH-LABEL: mad_sub_inv_f16:
1607 ; GFX11-FLUSH:       ; %bb.0:
1608 ; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1609 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1610 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1611 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1612 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1613 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1614 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1615 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1616 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1617 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1618 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1619 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
1620 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1621 ; GFX11-FLUSH-NEXT:    s_nop 0
1622 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1623 ; GFX11-FLUSH-NEXT:    s_endpgm
1625 ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16:
1626 ; GFX11-DENORM-STRICT:       ; %bb.0:
1627 ; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1628 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1629 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1630 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1631 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1632 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1633 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1634 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1635 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1636 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1637 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1638 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
1639 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
1640 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
1641 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1642 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
1644 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
1645 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
1646 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1647 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1648 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1649 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1650 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1651 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1652 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1653 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1654 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1655 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
1656 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
1657 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
1658 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1659 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
; Per-work-item element index: the x work-item id, sign-extended to i64.
1660   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
1661   %tid.ext = sext i32 %tid to i64
; Addresses of the three consecutive half inputs and of the output element.
1662   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
1663   %add1 = add i64 %tid.ext, 1
1664   %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
1665   %add2 = add i64 %tid.ext, 2
1666   %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
1667   %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads; the expected output waits (s_waitcnt vmcnt(0)) after each one.
1668   %a = load volatile half, ptr addrspace(1) %gep0, align 2
1669   %b = load volatile half, ptr addrspace(1) %gep1, align 2
1670   %c = load volatile half, ptr addrspace(1) %gep2, align 2
; c - a * b: the product is the right-hand operand of the fsub, which is why
; the fused forms above negate the first multiplicand instead of the addend.
1671   %mul = fmul half %a, %b
1672   %sub = fsub half %c, %mul
1673   store half %sub, ptr addrspace(1) %outgep, align 2
1674   ret void
; mad_sub_fabs_f16: computes a * b - |c| on half values using separate fmul
; and fsub instructions, with |c| produced by llvm.fabs.f16. a, b and c are
; read from %ptr at element indices tid, tid + 1 and tid + 2; the result is
; stored to %out at element index tid.
;
; Instruction selection recorded by the check blocks below (the absolute value
; always appears as a |v3| source modifier, never as a separate instruction):
;   - VI flush run:                         one v_mad_f16 with -|v3| as addend
;   - VI denormal contract run:             one v_fma_f16 with -|v3| as addend
;   - GFX10/GFX11 flush and strict runs:    v_mul_f16, then v_sub_f16_e64 with |v3|
;   - GFX10/GFX11 denormal contract runs:   one v_fma_f16 with -|v3| as addend
;
; NOTE(review): unlike GFX10/GFX11, no strict denormal check block is visible
; for VI in this function - confirm that this coverage gap is intentional.
1677 define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
1678 ; VI-FLUSH-LABEL: mad_sub_fabs_f16:
1679 ; VI-FLUSH:       ; %bb.0:
1680 ; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1681 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1682 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1683 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
1684 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1685 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1686 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1687 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1688 ; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1689 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1690 ; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
1691 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1692 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1693 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1694 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
1695 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1696 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1697 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1698 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1699 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, v2, -|v3|
1700 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1701 ; VI-FLUSH-NEXT:    s_endpgm
1703 ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
1704 ; VI-DENORM-CONTRACT:       ; %bb.0:
1705 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1706 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1707 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1708 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
1709 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1710 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1711 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1712 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1713 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1714 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1715 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
1716 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1717 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
1718 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1719 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
1720 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1721 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
1722 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1723 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1724 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, -|v3|
1725 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
1726 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
1728 ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16:
1729 ; GFX10-FLUSH:       ; %bb.0:
1730 ; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1731 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1732 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1733 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1734 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1735 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1736 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1737 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1738 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1739 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1740 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
1741 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1742 ; GFX10-FLUSH-NEXT:    s_endpgm
1744 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
1745 ; GFX10-DENORM-STRICT:       ; %bb.0:
1746 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1747 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1748 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1749 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1750 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1751 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1752 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1753 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1754 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1755 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1756 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
1757 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
1758 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
1760 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
1761 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
1762 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1763 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1764 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1765 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1766 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1767 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1768 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1769 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1770 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1771 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
1772 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
1773 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
1775 ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16:
1776 ; GFX11-FLUSH:       ; %bb.0:
1777 ; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1778 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1779 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1780 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1781 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1782 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1783 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1784 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1785 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1786 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1787 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1788 ; GFX11-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
1789 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1790 ; GFX11-FLUSH-NEXT:    s_nop 0
1791 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1792 ; GFX11-FLUSH-NEXT:    s_endpgm
1794 ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
1795 ; GFX11-DENORM-STRICT:       ; %bb.0:
1796 ; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1797 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1798 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1799 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1800 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1801 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1802 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1803 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1804 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1805 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1806 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1807 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
1808 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
1809 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
1810 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1811 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
1813 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
1814 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
1815 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1816 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1817 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1818 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1819 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1820 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1821 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1822 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1823 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1824 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
1825 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
1826 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
1827 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1828 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
; Per-work-item element index: the x work-item id, sign-extended to i64.
1829   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
1830   %tid.ext = sext i32 %tid to i64
; Addresses of the three consecutive half inputs and of the output element.
1831   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
1832   %add1 = add i64 %tid.ext, 1
1833   %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
1834   %add2 = add i64 %tid.ext, 2
1835   %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
1836   %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads; the expected output waits (s_waitcnt vmcnt(0)) after each one.
1837   %a = load volatile half, ptr addrspace(1) %gep0, align 2
1838   %b = load volatile half, ptr addrspace(1) %gep1, align 2
1839   %c = load volatile half, ptr addrspace(1) %gep2, align 2
; |c| is taken before the subtraction; every check block above expects it as
; a |v3| source modifier on the consuming instruction.
1840   %c.abs = call half @llvm.fabs.f16(half %c) #0
; a * b - |c| as two separate instructions.
1841   %mul = fmul half %a, %b
1842   %sub = fsub half %mul, %c.abs
1843   store half %sub, ptr addrspace(1) %outgep, align 2
1844   ret void
; mad_sub_fabs_inv_f16: each work item volatile-loads three consecutive halves
; a, b, c starting at %ptr[tid] and stores fabs(c) - a * b to %out[tid]. The IR
; uses a separate fmul and fsub, with the fabs applied to the minuend.
; Lowering expected by the check blocks below:
;   VI-FLUSH                            - single v_mad_f16 with -a and |c| modifiers
;   VI-/GFX10-/GFX11-DENORM-CONTRACT    - single v_fma_f16 with -a and |c| modifiers
;   GFX10-/GFX11-FLUSH, -DENORM-STRICT  - unfused v_mul_f16, then v_sub_f16 from |c|
1847 define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
1848 ; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16:
1849 ; VI-FLUSH:       ; %bb.0:
1850 ; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1851 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1852 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1853 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
1854 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1855 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1856 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1857 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1858 ; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1859 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1860 ; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
1861 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1862 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1863 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1864 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
1865 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1866 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1867 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1868 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1869 ; VI-FLUSH-NEXT:    v_mad_f16 v2, -v7, v2, |v3|
1870 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1871 ; VI-FLUSH-NEXT:    s_endpgm
1873 ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
1874 ; VI-DENORM-CONTRACT:       ; %bb.0:
1875 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1876 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1877 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1878 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
1879 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1880 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1881 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1882 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1883 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1884 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1885 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
1886 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1887 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
1888 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1889 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
1890 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1891 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
1892 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1893 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1894 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, -v7, v2, |v3|
1895 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
1896 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
1898 ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16:
1899 ; GFX10-FLUSH:       ; %bb.0:
1900 ; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1901 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1902 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1903 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1904 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1905 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1906 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1907 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1908 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1909 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1910 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
1911 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1912 ; GFX10-FLUSH-NEXT:    s_endpgm
1914 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
1915 ; GFX10-DENORM-STRICT:       ; %bb.0:
1916 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1917 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1918 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1919 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1920 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1921 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1922 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1923 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1924 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1925 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1926 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
1927 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
1928 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
1930 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
1931 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
1932 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1933 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1934 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1935 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1936 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1937 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1938 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1939 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1940 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1941 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
1942 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
1943 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
1945 ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16:
1946 ; GFX11-FLUSH:       ; %bb.0:
1947 ; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1948 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1949 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1950 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1951 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1952 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1953 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1954 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1955 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1956 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1957 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1958 ; GFX11-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
1959 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1960 ; GFX11-FLUSH-NEXT:    s_nop 0
1961 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1962 ; GFX11-FLUSH-NEXT:    s_endpgm
1964 ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
1965 ; GFX11-DENORM-STRICT:       ; %bb.0:
1966 ; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1967 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1968 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1969 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1970 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1971 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1972 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1973 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1974 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1975 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1976 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1977 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
1978 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
1979 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
1980 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1981 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
1983 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
1984 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
1985 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1986 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1987 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1988 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1989 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1990 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1991 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1992 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1993 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1994 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
1995 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
1996 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
1997 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1998 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
; Element addresses: %gep0/%gep1/%gep2 index %ptr at tid, tid+1 and tid+2;
; %outgep indexes %out at tid.
1999   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
2000   %tid.ext = sext i32 %tid to i64
2001   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
2002   %add1 = add i64 %tid.ext, 1
2003   %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
2004   %add2 = add i64 %tid.ext, 2
2005   %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
2006   %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
; The three loads are volatile; every check block above expects three separate
; loads, each followed by its own s_waitcnt vmcnt(0).
2007   %a = load volatile half, ptr addrspace(1) %gep0, align 2
2008   %b = load volatile half, ptr addrspace(1) %gep1, align 2
2009   %c = load volatile half, ptr addrspace(1) %gep2, align 2
2010   %c.abs = call half @llvm.fabs.f16(half %c) #0 ; |c|
2011   %mul = fmul half %a, %b ; a * b
2012   %sub = fsub half %c.abs, %mul ; |c| - a * b (fabs operand is the minuend)
2013   store half %sub, ptr addrspace(1) %outgep, align 2
2014   ret void
; neg_neg_mad_f16: each work item volatile-loads three consecutive halves
; a, b, c starting at %ptr[tid] and stores (-a) * (-b) + c to %out[tid]. Both
; multiplicands are negated in the IR; none of the expected instructions below
; carries a negate source modifier.
; Lowering expected by the check blocks below:
;   VI-FLUSH                            - single v_mac_f16 accumulating into c
;   VI-DENORM-CONTRACT                  - single v_fma_f16
;   GFX10-/GFX11-DENORM-CONTRACT        - single v_fmac_f16 accumulating into c
;   GFX10-/GFX11-FLUSH, -DENORM-STRICT  - unfused v_mul_f16, then v_add_f16
2017 define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
2018 ; VI-FLUSH-LABEL: neg_neg_mad_f16:
2019 ; VI-FLUSH:       ; %bb.0:
2020 ; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2021 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
2022 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2023 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
2024 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
2025 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2026 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2027 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2028 ; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
2029 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2030 ; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
2031 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2032 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
2033 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2034 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
2035 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2036 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
2037 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
2038 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2039 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v7, v2
2040 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
2041 ; VI-FLUSH-NEXT:    s_endpgm
2043 ; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
2044 ; VI-DENORM-CONTRACT:       ; %bb.0:
2045 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2046 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
2047 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2048 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
2049 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
2050 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2051 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2052 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2053 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
2054 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2055 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
2056 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2057 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
2058 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2059 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
2060 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2061 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
2062 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
2063 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2064 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, v3
2065 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
2066 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
2068 ; GFX10-FLUSH-LABEL: neg_neg_mad_f16:
2069 ; GFX10-FLUSH:       ; %bb.0:
2070 ; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2071 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2072 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2073 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
2074 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2075 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
2076 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2077 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
2078 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2079 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
2080 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
2081 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
2082 ; GFX10-FLUSH-NEXT:    s_endpgm
2084 ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16:
2085 ; GFX10-DENORM-STRICT:       ; %bb.0:
2086 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2087 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2088 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2089 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
2090 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2091 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
2092 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2093 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
2094 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2095 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
2096 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
2097 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
2098 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
2100 ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
2101 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
2102 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2103 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2104 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2105 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
2106 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2107 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
2108 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2109 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
2110 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2111 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
2112 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[0:1]
2113 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
2115 ; GFX11-FLUSH-LABEL: neg_neg_mad_f16:
2116 ; GFX11-FLUSH:       ; %bb.0:
2117 ; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2118 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2119 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2120 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
2121 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2122 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2123 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2124 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2125 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2126 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
2127 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2128 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
2129 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
2130 ; GFX11-FLUSH-NEXT:    s_nop 0
2131 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2132 ; GFX11-FLUSH-NEXT:    s_endpgm
2134 ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16:
2135 ; GFX11-DENORM-STRICT:       ; %bb.0:
2136 ; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2137 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2138 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2139 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
2140 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2141 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2142 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2143 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2144 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2145 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
2146 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2147 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
2148 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
2149 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
2150 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2151 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
2153 ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
2154 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
2155 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2156 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2157 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2158 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
2159 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2160 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2161 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2162 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2163 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2164 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
2165 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v3, s[0:1]
2166 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
2167 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2168 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
; Element addresses: %gep0/%gep1/%gep2 index %ptr at tid, tid+1 and tid+2;
; %outgep indexes %out at tid.
2169   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
2170   %tid.ext = sext i32 %tid to i64
2171   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
2172   %add1 = add i64 %tid.ext, 1
2173   %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
2174   %add2 = add i64 %tid.ext, 2
2175   %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
2176   %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
; The three loads are volatile; every check block above expects three separate
; loads, each followed by its own s_waitcnt vmcnt(0).
2177   %a = load volatile half, ptr addrspace(1) %gep0, align 2
2178   %b = load volatile half, ptr addrspace(1) %gep1, align 2
2179   %c = load volatile half, ptr addrspace(1) %gep2, align 2
2180   %nega = fneg half %a ; -a
2181   %negb = fneg half %b ; -b
2182   %mul = fmul half %nega, %negb ; (-a) * (-b)
2183   %sub = fadd half %mul, %c ; NOTE: despite its name, %sub is a sum: (-a) * (-b) + c
2184   store half %sub, ptr addrspace(1) %outgep, align 2
2185   ret void
; mad_fabs_sub_f16: each work item volatile-loads three consecutive halves
; a, b, c starting at %ptr[tid] and stores a * fabs(b) - c to %out[tid]. The
; fabs is applied to the second multiplicand, and the IR uses a separate fmul
; and fsub.
; Lowering expected by the check blocks below:
;   VI-FLUSH                            - single v_mad_f16 with |b| and -c modifiers
;   VI-/GFX10-/GFX11-DENORM-CONTRACT    - single v_fma_f16 with |b| and -c modifiers
;   GFX10-/GFX11-FLUSH, -DENORM-STRICT  - unfused v_mul_f16 (with |b|), then v_sub_f16
2188 define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
2189 ; VI-FLUSH-LABEL: mad_fabs_sub_f16:
2190 ; VI-FLUSH:       ; %bb.0:
2191 ; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2192 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
2193 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2194 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
2195 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
2196 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2197 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2198 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2199 ; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
2200 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2201 ; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
2202 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2203 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
2204 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2205 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
2206 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2207 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
2208 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
2209 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2210 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, |v2|, -v3
2211 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
2212 ; VI-FLUSH-NEXT:    s_endpgm
2214 ; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
2215 ; VI-DENORM-CONTRACT:       ; %bb.0:
2216 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2217 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
2218 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2219 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
2220 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
2221 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2222 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2223 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2224 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
2225 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2226 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
2227 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2228 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
2229 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2230 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
2231 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2232 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
2233 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
2234 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2235 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, |v2|, -v3
2236 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
2237 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
2239 ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16:
2240 ; GFX10-FLUSH:       ; %bb.0:
2241 ; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2242 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2243 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2244 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
2245 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2246 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
2247 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2248 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
2249 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2250 ; GFX10-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
2251 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
2252 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
2253 ; GFX10-FLUSH-NEXT:    s_endpgm
2255 ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
2256 ; GFX10-DENORM-STRICT:       ; %bb.0:
2257 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2258 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2259 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2260 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
2261 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2262 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
2263 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2264 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
2265 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2266 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
2267 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
2268 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
2269 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
2271 ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
2272 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
2273 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2274 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2275 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2276 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
2277 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2278 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
2279 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2280 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
2281 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2282 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
2283 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
2284 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
2286 ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16:
2287 ; GFX11-FLUSH:       ; %bb.0:
2288 ; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2289 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2290 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2291 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
2292 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2293 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2294 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2295 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2296 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2297 ; GFX11-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
2298 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2299 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
2300 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
2301 ; GFX11-FLUSH-NEXT:    s_nop 0
2302 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2303 ; GFX11-FLUSH-NEXT:    s_endpgm
2305 ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
2306 ; GFX11-DENORM-STRICT:       ; %bb.0:
2307 ; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2308 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2309 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2310 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
2311 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2312 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2313 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2314 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2315 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2316 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
2317 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2318 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
2319 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
2320 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
2321 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2322 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
2324 ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
2325 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
2326 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2327 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2328 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2329 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
2330 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2331 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2332 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2333 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2334 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2335 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
2336 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
2337 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
2338 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2339 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
; Element addresses: %gep0/%gep1/%gep2 index %ptr at tid, tid+1 and tid+2;
; %outgep indexes %out at tid.
2340   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
2341   %tid.ext = sext i32 %tid to i64
2342   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
2343   %add1 = add i64 %tid.ext, 1
2344   %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
2345   %add2 = add i64 %tid.ext, 2
2346   %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
2347   %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
; The three loads are volatile; every check block above expects three separate
; loads, each followed by its own s_waitcnt vmcnt(0).
2348   %a = load volatile half, ptr addrspace(1) %gep0, align 2
2349   %b = load volatile half, ptr addrspace(1) %gep1, align 2
2350   %c = load volatile half, ptr addrspace(1) %gep2, align 2
2351   %b.abs = call half @llvm.fabs.f16(half %b) #0 ; |b|
2352   %mul = fmul half %a, %b.abs ; a * |b|
2353   %sub = fsub half %mul, %c ; a * |b| - c
2354   store half %sub, ptr addrspace(1) %outgep, align 2
2355   ret void
2358 define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
2359 ; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
2360 ; VI-FLUSH:       ; %bb.0:
2361 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2362 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2363 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2364 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
2365 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2366 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2367 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2368 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2369 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
2370 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2371 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
2372 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2373 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, -2.0, v4
2374 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
2375 ; VI-FLUSH-NEXT:    s_endpgm
2377 ; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
2378 ; VI-DENORM-CONTRACT:       ; %bb.0:
2379 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2380 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2381 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2382 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
2383 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2384 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2385 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2386 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2387 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v4, v[0:1] glc
2388 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2389 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
2390 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2391 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v4, -2.0, v2
2392 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
2393 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
2395 ; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
2396 ; GFX10-FLUSH:       ; %bb.0:
2397 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2398 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2399 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2400 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
2401 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2402 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2403 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2404 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
2405 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
2406 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
2407 ; GFX10-FLUSH-NEXT:    s_endpgm
2409 ; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
2410 ; GFX10-DENORM-STRICT:       ; %bb.0:
2411 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2412 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2413 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2414 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
2415 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2416 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2417 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2418 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
2419 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v2, v1
2420 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
2421 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
2423 ; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
2424 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
2425 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2426 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2427 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2428 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
2429 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2430 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2431 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2432 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
2433 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v2, s[0:1]
2434 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
2436 ; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
2437 ; GFX11-FLUSH:       ; %bb.0:
2438 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2439 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2440 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2441 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
2442 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2443 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2444 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2445 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
2446 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2447 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
2448 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
2449 ; GFX11-FLUSH-NEXT:    s_nop 0
2450 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2451 ; GFX11-FLUSH-NEXT:    s_endpgm
2453 ; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
2454 ; GFX11-DENORM-STRICT:       ; %bb.0:
2455 ; GFX11-DENORM-STRICT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2456 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2457 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2458 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
2459 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2460 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2461 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2462 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
2463 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2464 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v2, v1
2465 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
2466 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
2467 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2468 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
2470 ; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
2471 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
2472 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2473 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2474 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2475 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
2476 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2477 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2478 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2479 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
2480 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v2, s[0:1]
2481 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
2482 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2483 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
2484   %tid = call i32 @llvm.amdgcn.workitem.id.x()
2485   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
2486   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
2487   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
2489   %r1 = load volatile half, ptr addrspace(1) %gep.0
2490   %r2 = load volatile half, ptr addrspace(1) %gep.1
2492   %add = fadd half %r1, %r1
2493   %r3 = fsub half %r2, %add
2495   store half %r3, ptr addrspace(1) %gep.out
2496   ret void
; fsub_fadd_a_a_c_f16: computes (a + a) - c on half values, where a and c are
; volatile loads from %out[tid] and %out[tid + 1] (tid is the workitem id x),
; and stores the result back to %out[tid]. The %in argument is not used by the
; body.
;
; Per the assertions below:
;   - the VI flush run expects one v_mad_f16 with operands (a, 2.0, -c);
;   - the VI, GFX10 and GFX11 DENORM-CONTRACT runs expect one v_fma_f16 with
;     operands (a, 2.0, -c);
;   - the GFX10 and GFX11 FLUSH and DENORM-STRICT runs expect a separate
;     v_add_f16 followed by v_sub_f16.
;
; NOTE(review): this function has no check block for the plain VI-DENORM prefix,
; so the VI denormal run that uses -fp-contract=on appears to have no
; assertions here. Presumably its output differs from the -fp-contract=fast
; run and the generator dropped the shared prefix. Consider giving that run
; line its own STRICT prefix and regenerating. TODO confirm.
2499 define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
2500 ; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
2501 ; VI-FLUSH:       ; %bb.0:
2502 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2503 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2504 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2505 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
2506 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2507 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2508 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2509 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2510 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
2511 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2512 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
2513 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2514 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v4, 2.0, -v2
2515 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
2516 ; VI-FLUSH-NEXT:    s_endpgm
2518 ; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
2519 ; VI-DENORM-CONTRACT:       ; %bb.0:
2520 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2521 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2522 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2523 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
2524 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2525 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2526 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2527 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2528 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v4, v[0:1] glc
2529 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2530 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
2531 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2532 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v4, 2.0, -v2
2533 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
2534 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
2536 ; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
2537 ; GFX10-FLUSH:       ; %bb.0:
2538 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2539 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2540 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2541 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
2542 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2543 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2544 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2545 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
2546 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
2547 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
2548 ; GFX10-FLUSH-NEXT:    s_endpgm
2550 ; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
2551 ; GFX10-DENORM-STRICT:       ; %bb.0:
2552 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2553 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2554 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2555 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
2556 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2557 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2558 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2559 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
2560 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v2
2561 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
2562 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
2564 ; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
2565 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
2566 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2567 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2568 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2569 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
2570 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2571 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2572 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2573 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
2574 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
2575 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
2577 ; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
2578 ; GFX11-FLUSH:       ; %bb.0:
2579 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2580 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2581 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2582 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
2583 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2584 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2585 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2586 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
2587 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2588 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
2589 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
2590 ; GFX11-FLUSH-NEXT:    s_nop 0
2591 ; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2592 ; GFX11-FLUSH-NEXT:    s_endpgm
2594 ; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
2595 ; GFX11-DENORM-STRICT:       ; %bb.0:
2596 ; GFX11-DENORM-STRICT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2597 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2598 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2599 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
2600 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2601 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2602 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2603 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
2604 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2605 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v2
2606 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
2607 ; GFX11-DENORM-STRICT-NEXT:    s_nop 0
2608 ; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2609 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
2611 ; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
2612 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
2613 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2614 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2615 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2616 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
2617 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2618 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2619 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2620 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
2621 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
2622 ; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
2623 ; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2624 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
2625   %tid = call i32 @llvm.amdgcn.workitem.id.x()
2626   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
2627   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
2628   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
2630   %r1 = load volatile half, ptr addrspace(1) %gep.0
2631   %r2 = load volatile half, ptr addrspace(1) %gep.1
2633   %add = fadd half %r1, %r1
2634   %r3 = fsub half %add, %r2
2636   store half %r3, ptr addrspace(1) %gep.out
2637   ret void
; Attribute groups #0 and #1. The fsub_fadd_a_a_c_f16 kernel defined above does
; not reference either group; presumably other definitions or declarations in
; this file do. TODO confirm.
2640 attributes #0 = { nounwind }
2641 attributes #1 = { nounwind readnone }