ELF: Have __rela_iplt_{start,end} surround .rela.iplt with --pack-dyn-relocs=android.
[llvm-project.git] / llvm / test / CodeGen / AMDGPU / fmuladd.f16.ll
blob a753e38b04abf4891b07fac910071d1aa74c22c8
1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-FLUSH %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-FLUSH %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-DENORM %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-DENORM,VI-DENORM-CONTRACT %s
8 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-FLUSH %s
9 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-FLUSH %s
10 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-STRICT %s
11 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-CONTRACT %s
13 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH %s
14 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH %s
15 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM,GFX11-DENORM-STRICT %s
16 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM,GFX11-DENORM-CONTRACT %s
18 declare i32 @llvm.amdgcn.workitem.id.x() #1
19 declare half @llvm.fmuladd.f16(half, half, half) #1
20 declare half @llvm.fabs.f16(half) #1
; fmuladd_f16: loads one half from each of %in1, %in2 and %in3, combines them
; with the llvm.fmuladd.f16 intrinsic and stores the result to %out.
; Expected selection according to the autogenerated checks below:
;   - fiji, -denormal-fp-math=preserve-sign runs            -> v_mac_f16
;   - fiji, -denormal-fp-math=ieee runs                     -> v_fma_f16
;   - gfx1010/gfx1100, -denormal-fp-math=preserve-sign runs -> v_mul_f16 + v_add_f16
;   - gfx1010/gfx1100, -denormal-fp-math=ieee runs          -> v_fmac_f16
; The check lines are produced by utils/update_llc_test_checks.py; regenerate
; them with that script rather than editing them by hand.
22 define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
23 ; VI-FLUSH-LABEL: fmuladd_f16:
24 ; VI-FLUSH:       ; %bb.0:
25 ; VI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
26 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
27 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
28 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
29 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v2, s4
30 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v3, s5
31 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v4, s6
32 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v5, s7
33 ; VI-FLUSH-NEXT:    flat_load_ushort v6, v[0:1]
34 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3]
35 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5]
36 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
37 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
38 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
39 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v6, v2
40 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
41 ; VI-FLUSH-NEXT:    s_endpgm
43 ; VI-DENORM-LABEL: fmuladd_f16:
44 ; VI-DENORM:       ; %bb.0:
45 ; VI-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
46 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
47 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s2
48 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s3
49 ; VI-DENORM-NEXT:    v_mov_b32_e32 v2, s4
50 ; VI-DENORM-NEXT:    v_mov_b32_e32 v3, s5
51 ; VI-DENORM-NEXT:    v_mov_b32_e32 v4, s6
52 ; VI-DENORM-NEXT:    v_mov_b32_e32 v5, s7
53 ; VI-DENORM-NEXT:    flat_load_ushort v6, v[0:1]
54 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3]
55 ; VI-DENORM-NEXT:    flat_load_ushort v3, v[4:5]
56 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s0
57 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
58 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
59 ; VI-DENORM-NEXT:    v_fma_f16 v2, v6, v2, v3
60 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
61 ; VI-DENORM-NEXT:    s_endpgm
63 ; GFX10-FLUSH-LABEL: fmuladd_f16:
64 ; GFX10-FLUSH:       ; %bb.0:
65 ; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
66 ; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
67 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
68 ; GFX10-FLUSH-NEXT:    s_clause 0x2
69 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[10:11]
70 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[12:13]
71 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[14:15]
72 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
73 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
74 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
75 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
76 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[8:9]
77 ; GFX10-FLUSH-NEXT:    s_endpgm
79 ; GFX10-DENORM-LABEL: fmuladd_f16:
80 ; GFX10-DENORM:       ; %bb.0:
81 ; GFX10-DENORM-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
82 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, 0
83 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
84 ; GFX10-DENORM-NEXT:    s_clause 0x2
85 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[10:11]
86 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[12:13]
87 ; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[14:15]
88 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
89 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
90 ; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[8:9]
91 ; GFX10-DENORM-NEXT:    s_endpgm
93 ; GFX11-FLUSH-LABEL: fmuladd_f16:
94 ; GFX11-FLUSH:       ; %bb.0:
95 ; GFX11-FLUSH-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
96 ; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
97 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
98 ; GFX11-FLUSH-NEXT:    s_clause 0x2
99 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3]
100 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[4:5]
101 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7]
102 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(1)
103 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
104 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
105 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
106 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
107 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
108 ; GFX11-FLUSH-NEXT:    s_endpgm
110 ; GFX11-DENORM-LABEL: fmuladd_f16:
111 ; GFX11-DENORM:       ; %bb.0:
112 ; GFX11-DENORM-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
113 ; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, 0
114 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
115 ; GFX11-DENORM-NEXT:    s_clause 0x2
116 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[2:3]
117 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[4:5]
118 ; GFX11-DENORM-NEXT:    global_load_u16 v3, v0, s[6:7]
119 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
120 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
121 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v3, s[0:1]
122 ; GFX11-DENORM-NEXT:    s_endpgm
123                          ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
124   %r0 = load half, ptr addrspace(1) %in1
125   %r1 = load half, ptr addrspace(1) %in2
126   %r2 = load half, ptr addrspace(1) %in3
  ; The operation under test: %r0 * %r1 + %r2 expressed through the intrinsic.
127   %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
128   store half %r3, ptr addrspace(1) %out
129   ret void
; fmul_fadd_f16: same dataflow as fmuladd_f16, but written as a separate fmul
; and fadd that carry no fast-math flags.
; Expected selection according to the autogenerated checks below:
;   - -denormal-fp-math=preserve-sign runs (one prefix per target, shared by
;     both -fp-contract settings): v_mac_f16 on fiji, v_mul_f16 + v_add_f16 on
;     gfx1010/gfx1100.
;   - -denormal-fp-math=ieee with -fp-contract=on (the STRICT prefixes):
;     v_mul_f16 + v_add_f16 on gfx1010/gfx1100.
;   - -denormal-fp-math=ieee with -fp-contract=fast (the CONTRACT prefixes):
;     v_fma_f16 on fiji, v_fmac_f16 on gfx1010/gfx1100.
; Review note: the fiji ieee -fp-contract=on run enables only the plain fiji
; denormal prefix, and this function has no check block under that prefix, so
; that configuration is currently not checked for this function - confirm
; whether a dedicated strict prefix should be added to that run line and the
; assertions regenerated.
132 define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
133 ; VI-FLUSH-LABEL: fmul_fadd_f16:
134 ; VI-FLUSH:       ; %bb.0:
135 ; VI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
136 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
137 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
138 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
139 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v2, s4
140 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v3, s5
141 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v4, s6
142 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v5, s7
143 ; VI-FLUSH-NEXT:    flat_load_ushort v6, v[0:1]
144 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3]
145 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5]
146 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
147 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
148 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
149 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v6, v2
150 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
151 ; VI-FLUSH-NEXT:    s_endpgm
153 ; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
154 ; VI-DENORM-CONTRACT:       ; %bb.0:
155 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
156 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
157 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, s2
158 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
159 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v2, s4
160 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v3, s5
161 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v4, s6
162 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v5, s7
163 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v6, v[0:1]
164 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3]
165 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5]
166 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, s0
167 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
168 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
169 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v6, v2, v3
170 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
171 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
173 ; GFX10-FLUSH-LABEL: fmul_fadd_f16:
174 ; GFX10-FLUSH:       ; %bb.0:
175 ; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
176 ; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
177 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
178 ; GFX10-FLUSH-NEXT:    s_clause 0x2
179 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[10:11]
180 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[12:13]
181 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[14:15]
182 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
183 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
184 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
185 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
186 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[8:9]
187 ; GFX10-FLUSH-NEXT:    s_endpgm
189 ; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16:
190 ; GFX10-DENORM-STRICT:       ; %bb.0:
191 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
192 ; GFX10-DENORM-STRICT-NEXT:    v_mov_b32_e32 v0, 0
193 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
194 ; GFX10-DENORM-STRICT-NEXT:    s_clause 0x2
195 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[10:11]
196 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[12:13]
197 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[14:15]
198 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(1)
199 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
200 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
201 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v3
202 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[8:9]
203 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
205 ; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
206 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
207 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
208 ; GFX10-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, 0
209 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
210 ; GFX10-DENORM-CONTRACT-NEXT:    s_clause 0x2
211 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[10:11]
212 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[12:13]
213 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[14:15]
214 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
215 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
216 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[8:9]
217 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
219 ; GFX11-FLUSH-LABEL: fmul_fadd_f16:
220 ; GFX11-FLUSH:       ; %bb.0:
221 ; GFX11-FLUSH-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
222 ; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
223 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
224 ; GFX11-FLUSH-NEXT:    s_clause 0x2
225 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3]
226 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[4:5]
227 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7]
228 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(1)
229 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
230 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
231 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
232 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
233 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
234 ; GFX11-FLUSH-NEXT:    s_endpgm
236 ; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16:
237 ; GFX11-DENORM-STRICT:       ; %bb.0:
238 ; GFX11-DENORM-STRICT-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
239 ; GFX11-DENORM-STRICT-NEXT:    v_mov_b32_e32 v0, 0
240 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
241 ; GFX11-DENORM-STRICT-NEXT:    s_clause 0x2
242 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3]
243 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[4:5]
244 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7]
245 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(1)
246 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
247 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
248 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
249 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v3
250 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
251 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
253 ; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
254 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
255 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
256 ; GFX11-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, 0
257 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
258 ; GFX11-DENORM-CONTRACT-NEXT:    s_clause 0x2
259 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3]
260 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[4:5]
261 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7]
262 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
263 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
264 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v3, s[0:1]
265 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
266                          ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
267   %r0 = load half, ptr addrspace(1) %in1
268   %r1 = load half, ptr addrspace(1) %in2
269   %r2 = load half, ptr addrspace(1) %in3
  ; Plain fmul followed by fadd, neither carrying any fast-math flag.
270   %mul = fmul half %r0, %r1
271   %add = fadd half %mul, %r2
272   store half %add, ptr addrspace(1) %out
273   ret void
; fmul_fadd_contract_f16: separate fmul and fadd that both carry the
; `contract` fast-math flag.
; Expected selection according to the autogenerated checks below; each check
; prefix used here is shared by the -fp-contract=on and -fp-contract=fast runs
; of its configuration, so the same code is expected for both:
;   - fiji, -denormal-fp-math=preserve-sign runs            -> v_mac_f16
;   - fiji, -denormal-fp-math=ieee runs                     -> v_fma_f16
;   - gfx1010/gfx1100, -denormal-fp-math=preserve-sign runs -> v_mul_f16 + v_add_f16
;   - gfx1010/gfx1100, -denormal-fp-math=ieee runs          -> v_fmac_f16
276 define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
277 ; VI-FLUSH-LABEL: fmul_fadd_contract_f16:
278 ; VI-FLUSH:       ; %bb.0:
279 ; VI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
280 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
281 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
282 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
283 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v2, s4
284 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v3, s5
285 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v4, s6
286 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v5, s7
287 ; VI-FLUSH-NEXT:    flat_load_ushort v6, v[0:1]
288 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3]
289 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5]
290 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
291 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
292 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
293 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v6, v2
294 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
295 ; VI-FLUSH-NEXT:    s_endpgm
297 ; VI-DENORM-LABEL: fmul_fadd_contract_f16:
298 ; VI-DENORM:       ; %bb.0:
299 ; VI-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
300 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
301 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s2
302 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s3
303 ; VI-DENORM-NEXT:    v_mov_b32_e32 v2, s4
304 ; VI-DENORM-NEXT:    v_mov_b32_e32 v3, s5
305 ; VI-DENORM-NEXT:    v_mov_b32_e32 v4, s6
306 ; VI-DENORM-NEXT:    v_mov_b32_e32 v5, s7
307 ; VI-DENORM-NEXT:    flat_load_ushort v6, v[0:1]
308 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3]
309 ; VI-DENORM-NEXT:    flat_load_ushort v3, v[4:5]
310 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s0
311 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
312 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
313 ; VI-DENORM-NEXT:    v_fma_f16 v2, v6, v2, v3
314 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
315 ; VI-DENORM-NEXT:    s_endpgm
317 ; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16:
318 ; GFX10-FLUSH:       ; %bb.0:
319 ; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
320 ; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
321 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
322 ; GFX10-FLUSH-NEXT:    s_clause 0x2
323 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[10:11]
324 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[12:13]
325 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[14:15]
326 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
327 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
328 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
329 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
330 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[8:9]
331 ; GFX10-FLUSH-NEXT:    s_endpgm
333 ; GFX10-DENORM-LABEL: fmul_fadd_contract_f16:
334 ; GFX10-DENORM:       ; %bb.0:
335 ; GFX10-DENORM-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
336 ; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, 0
337 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
338 ; GFX10-DENORM-NEXT:    s_clause 0x2
339 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[10:11]
340 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[12:13]
341 ; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[14:15]
342 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
343 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
344 ; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[8:9]
345 ; GFX10-DENORM-NEXT:    s_endpgm
347 ; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16:
348 ; GFX11-FLUSH:       ; %bb.0:
349 ; GFX11-FLUSH-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
350 ; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
351 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
352 ; GFX11-FLUSH-NEXT:    s_clause 0x2
353 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3]
354 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[4:5]
355 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7]
356 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(1)
357 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
358 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
359 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
360 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
361 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
362 ; GFX11-FLUSH-NEXT:    s_endpgm
364 ; GFX11-DENORM-LABEL: fmul_fadd_contract_f16:
365 ; GFX11-DENORM:       ; %bb.0:
366 ; GFX11-DENORM-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
367 ; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, 0
368 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
369 ; GFX11-DENORM-NEXT:    s_clause 0x2
370 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[2:3]
371 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[4:5]
372 ; GFX11-DENORM-NEXT:    global_load_u16 v3, v0, s[6:7]
373 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
374 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
375 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v3, s[0:1]
376 ; GFX11-DENORM-NEXT:    s_endpgm
377                          ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
378   %r0 = load half, ptr addrspace(1) %in1
379   %r1 = load half, ptr addrspace(1) %in2
380   %r2 = load half, ptr addrspace(1) %in3
  ; Both operations carry the `contract` flag.
381   %mul = fmul contract half %r0, %r1
382   %add = fadd contract half %mul, %r2
383   store half %add, ptr addrspace(1) %out
384   ret void
; fmuladd_2.0_a_b_f16: llvm.fmuladd.f16 with the constant 2.0 as the first
; multiplicand, indexed per work-item.
; Expected selection according to the autogenerated checks below:
;   - fiji, -denormal-fp-math=preserve-sign runs -> v_mac_f16, with 2.0
;     appearing directly as an operand
;   - fiji, -denormal-fp-math=ieee runs          -> v_fma_f16, with 2.0
;     appearing directly as an operand
;   - gfx1010/gfx1100, preserve-sign runs        -> the multiply by 2.0 shows
;     up as v_add_f16 of the value with itself, followed by a second v_add_f16
;     for the addend
;   - gfx1010/gfx1100, ieee runs                 -> v_fmac_f16, with 2.0
;     appearing directly as an operand
; The gfx1100 checks additionally expect v0 to be masked with 0x3ff before the
; shift that scales the work-item id.
387 define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
388 ; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
389 ; VI-FLUSH:       ; %bb.0:
390 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
391 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
392 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
393 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
394 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
395 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
396 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
397 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
398 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
399 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
400 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
401 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
402 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
403 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
404 ; VI-FLUSH-NEXT:    s_endpgm
406 ; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16:
407 ; VI-DENORM:       ; %bb.0:
408 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
409 ; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
410 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
411 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
412 ; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
413 ; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
414 ; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
415 ; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
416 ; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
417 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
418 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
419 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
420 ; VI-DENORM-NEXT:    v_fma_f16 v2, v4, 2.0, v2
421 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
422 ; VI-DENORM-NEXT:    s_endpgm
424 ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
425 ; GFX10-FLUSH:       ; %bb.0:
426 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
427 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
428 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
429 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
430 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
431 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
432 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
433 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
434 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
435 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
436 ; GFX10-FLUSH-NEXT:    s_endpgm
438 ; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16:
439 ; GFX10-DENORM:       ; %bb.0:
440 ; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
441 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
442 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
443 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
444 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
445 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
446 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
447 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
448 ; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
449 ; GFX10-DENORM-NEXT:    s_endpgm
451 ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
452 ; GFX11-FLUSH:       ; %bb.0:
453 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
454 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
455 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
456 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
457 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
458 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
459 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
460 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
461 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
462 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
463 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
464 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
465 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
466 ; GFX11-FLUSH-NEXT:    s_endpgm
468 ; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16:
469 ; GFX11-DENORM:       ; %bb.0:
470 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
471 ; GFX11-DENORM-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
472 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
473 ; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
474 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
475 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
476 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
477 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
478 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
479 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
480 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
481 ; GFX11-DENORM-NEXT:    s_endpgm
  ; Address out[tid] and out[tid + 1]. The %in argument is never referenced in
  ; this body; both loads and the store go through %out.
482   %tid = call i32 @llvm.amdgcn.workitem.id.x()
483   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
484   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
485   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
  ; Both loads are volatile; the expected code issues them one at a time, each
  ; followed by its own s_waitcnt vmcnt(0).
487   %r1 = load volatile half, ptr addrspace(1) %gep.0
488   %r2 = load volatile half, ptr addrspace(1) %gep.1
  ; 2.0 * %r1 + %r2, with the constant in the first multiplicand position.
490   %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
491   store half %r3, ptr addrspace(1) %gep.out
492   ret void
; fmuladd_a_2.0_b_f16: llvm.fmuladd.f16 with the constant 2.0 as the second
; multiplicand, indexed per work-item.
; Expected selection according to the autogenerated checks below:
;   - fiji, -denormal-fp-math=preserve-sign runs -> v_mac_f16, with 2.0
;     appearing directly as an operand
;   - fiji, -denormal-fp-math=ieee runs          -> v_fma_f16, with 2.0
;     appearing directly as an operand
;   - gfx1010/gfx1100, preserve-sign runs        -> the multiply by 2.0 shows
;     up as v_add_f16 of the value with itself, followed by a second v_add_f16
;     for the addend
;   - gfx1010/gfx1100, ieee runs                 -> v_fmac_f16, with 2.0
;     appearing directly as an operand
; The gfx1100 checks additionally expect v0 to be masked with 0x3ff before the
; shift that scales the work-item id.
495 define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
496 ; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
497 ; VI-FLUSH:       ; %bb.0:
498 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
499 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
500 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
501 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
502 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
503 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
504 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
505 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
506 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
507 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
508 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
509 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
510 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
511 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
512 ; VI-FLUSH-NEXT:    s_endpgm
514 ; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16:
515 ; VI-DENORM:       ; %bb.0:
516 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
517 ; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
518 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
519 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
520 ; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
521 ; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
522 ; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
523 ; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
524 ; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
525 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
526 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
527 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
528 ; VI-DENORM-NEXT:    v_fma_f16 v2, v4, 2.0, v2
529 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
530 ; VI-DENORM-NEXT:    s_endpgm
532 ; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
533 ; GFX10-FLUSH:       ; %bb.0:
534 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
535 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
536 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
537 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
538 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
539 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
540 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
541 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
542 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
543 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
544 ; GFX10-FLUSH-NEXT:    s_endpgm
546 ; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16:
547 ; GFX10-DENORM:       ; %bb.0:
548 ; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
549 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
550 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
551 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
552 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
553 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
554 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
555 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
556 ; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
557 ; GFX10-DENORM-NEXT:    s_endpgm
559 ; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
560 ; GFX11-FLUSH:       ; %bb.0:
561 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
562 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
563 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
564 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
565 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
566 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
567 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
568 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
569 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
570 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
571 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
572 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
573 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
574 ; GFX11-FLUSH-NEXT:    s_endpgm
576 ; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16:
577 ; GFX11-DENORM:       ; %bb.0:
578 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
579 ; GFX11-DENORM-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
580 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
581 ; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
582 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
583 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
584 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
585 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
586 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
587 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
588 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
589 ; GFX11-DENORM-NEXT:    s_endpgm
  ; Address out[tid] and out[tid + 1]. The %in argument is never referenced in
  ; this body; both loads and the store go through %out.
590   %tid = call i32 @llvm.amdgcn.workitem.id.x()
591   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
592   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
593   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
  ; Both loads are volatile; the expected code issues them one at a time, each
  ; followed by its own s_waitcnt vmcnt(0).
595   %r1 = load volatile half, ptr addrspace(1) %gep.0
596   %r2 = load volatile half, ptr addrspace(1) %gep.1
  ; %r1 * 2.0 + %r2, with the constant in the second multiplicand position.
598   %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
599   store half %r3, ptr addrspace(1) %gep.out
600   ret void
; fadd_a_a_b_f16: computes (a + a) + b with two plain fadd instructions (no
; fmuladd intrinsic), where a and b are volatile half loads from %out[tid] and
; %out[tid + 1], and stores the result back to %out[tid].
; Expected selection per the assertions below:
;   * VI-FLUSH:                 one v_mac_f16 with the inline constant 2.0
;   * VI-DENORM-CONTRACT:       one v_fma_f16 with 2.0
;   * GFX10/GFX11 *-CONTRACT:   one v_fmac_f16 with 2.0
;   * GFX10/GFX11 FLUSH/STRICT: two separate v_add_f16 (a + a, then + b)
; %in1 and %in2 are not referenced by the body.
603 define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
604 ; VI-FLUSH-LABEL: fadd_a_a_b_f16:
605 ; VI-FLUSH:       ; %bb.0:
606 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
607 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
608 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
609 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
610 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
611 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
612 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
613 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
614 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
615 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
616 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
617 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
618 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
619 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
620 ; VI-FLUSH-NEXT:    s_endpgm
622 ; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
623 ; VI-DENORM-CONTRACT:       ; %bb.0:
624 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
625 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
626 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
627 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
628 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
629 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
630 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
631 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
632 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v4, v[0:1] glc
633 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
634 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
635 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
636 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v4, 2.0, v2
637 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
638 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
640 ; GFX10-FLUSH-LABEL: fadd_a_a_b_f16:
641 ; GFX10-FLUSH:       ; %bb.0:
642 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
643 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
644 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
645 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
646 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
647 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
648 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
649 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
650 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
651 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
652 ; GFX10-FLUSH-NEXT:    s_endpgm
654 ; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
655 ; GFX10-DENORM-STRICT:       ; %bb.0:
656 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
657 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
658 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
659 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
660 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
661 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
662 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
663 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
664 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v2
665 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
666 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
668 ; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
669 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
670 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
671 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
672 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
673 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
674 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
675 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
676 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
677 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
678 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v2, s[0:1]
679 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
681 ; GFX11-FLUSH-LABEL: fadd_a_a_b_f16:
682 ; GFX11-FLUSH:       ; %bb.0:
683 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
684 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
685 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
686 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
687 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
688 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
689 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
690 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
691 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
692 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
693 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
694 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
695 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
696 ; GFX11-FLUSH-NEXT:    s_endpgm
698 ; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
699 ; GFX11-DENORM-STRICT:       ; %bb.0:
700 ; GFX11-DENORM-STRICT-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
701 ; GFX11-DENORM-STRICT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
702 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
703 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
704 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
705 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
706 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
707 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
708 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
709 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
710 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
711 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v2
712 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
713 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
715 ; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
716 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
717 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
718 ; GFX11-DENORM-CONTRACT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
719 ; GFX11-DENORM-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
720 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
721 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
722 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
723 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
724 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
725 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
726 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
727 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v2, s[0:1]
728 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
729                             ptr addrspace(1) %in1,
730                             ptr addrspace(1) %in2) #0 {
731   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 -> %out[tid] (a), %gep.1 -> %out[tid + 1] (b); the result is
  ; written back through %gep.out, which is the same address as %gep.0.
732   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
733   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
734   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
736   %r0 = load volatile half, ptr addrspace(1) %gep.0
737   %r1 = load volatile half, ptr addrspace(1) %gep.1
  ; (a + a) + b: the a + a term is what the contracting configurations are
  ; expected to turn into a multiply by 2.0 fused with the following add.
739   %add.0 = fadd half %r0, %r0
740   %add.1 = fadd half %add.0, %r1
741   store half %add.1, ptr addrspace(1) %gep.out
742   ret void
; fadd_b_a_a_f16: computes b + (a + a) with two plain fadd instructions, i.e.
; the same expression as fadd_a_a_b_f16 but with the operands of the outer
; fadd swapped. a and b are volatile half loads from %out[tid] and
; %out[tid + 1]; the result is stored to %out[tid].
; Expected selection per the assertions below:
;   * VI-FLUSH:                 one v_mac_f16 with the inline constant 2.0
;   * VI-DENORM-CONTRACT:       one v_fma_f16 with 2.0
;   * GFX10/GFX11 *-CONTRACT:   one v_fmac_f16 with 2.0
;   * GFX10/GFX11 FLUSH/STRICT: v_add_f16 for a + a, then v_add_f16 with b as
;                               the first source operand
; %in1 and %in2 are not referenced by the body.
745 define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
746 ; VI-FLUSH-LABEL: fadd_b_a_a_f16:
747 ; VI-FLUSH:       ; %bb.0:
748 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
749 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
750 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
751 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
752 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
753 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
754 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
755 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
756 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
757 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
758 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
759 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
760 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
761 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
762 ; VI-FLUSH-NEXT:    s_endpgm
764 ; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
765 ; VI-DENORM-CONTRACT:       ; %bb.0:
766 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
767 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
768 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
769 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
770 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
771 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
772 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
773 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
774 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v4, v[0:1] glc
775 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
776 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
777 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
778 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v4, 2.0, v2
779 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
780 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
782 ; GFX10-FLUSH-LABEL: fadd_b_a_a_f16:
783 ; GFX10-FLUSH:       ; %bb.0:
784 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
785 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
786 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
787 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
788 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
789 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
790 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
791 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
792 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
793 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
794 ; GFX10-FLUSH-NEXT:    s_endpgm
796 ; GFX10-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
797 ; GFX10-DENORM-STRICT:       ; %bb.0:
798 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
799 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
800 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
801 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
802 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
803 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
804 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
805 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
806 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v2, v1
807 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
808 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
810 ; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
811 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
812 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
813 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
814 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
815 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
816 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
817 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
818 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
819 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
820 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v2, s[0:1]
821 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
823 ; GFX11-FLUSH-LABEL: fadd_b_a_a_f16:
824 ; GFX11-FLUSH:       ; %bb.0:
825 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
826 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
827 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
828 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
829 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
830 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
831 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
832 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
833 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
834 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
835 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
836 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
837 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
838 ; GFX11-FLUSH-NEXT:    s_endpgm
840 ; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
841 ; GFX11-DENORM-STRICT:       ; %bb.0:
842 ; GFX11-DENORM-STRICT-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
843 ; GFX11-DENORM-STRICT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
844 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
845 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
846 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
847 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
848 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
849 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
850 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
851 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
852 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
853 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v2, v1
854 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
855 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
857 ; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
858 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
859 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
860 ; GFX11-DENORM-CONTRACT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
861 ; GFX11-DENORM-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
862 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
863 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
864 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
865 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
866 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
867 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
868 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
869 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v2, s[0:1]
870 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
871                             ptr addrspace(1) %in1,
872                             ptr addrspace(1) %in2) #0 {
873   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 -> %out[tid] (a), %gep.1 -> %out[tid + 1] (b); the result is
  ; written back through %gep.out, which is the same address as %gep.0.
874   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
875   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
876   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
878   %r0 = load volatile half, ptr addrspace(1) %gep.0
879   %r1 = load volatile half, ptr addrspace(1) %gep.1
  ; b + (a + a): the doubled term is the second operand of the outer fadd
  ; here, unlike fadd_a_a_b_f16 where it is the first.
881   %add.0 = fadd half %r0, %r0
882   %add.1 = fadd half %r1, %add.0
883   store half %add.1, ptr addrspace(1) %gep.out
884   ret void
; fmuladd_neg_2.0_a_b_f16: computes llvm.fmuladd.f16(-2.0, a, b), where a and b
; are volatile half loads from %out[tid] and %out[tid + 1], and stores the
; result to %out[tid].
; Expected selection per the assertions below:
;   * VI-FLUSH:          v_mac_f16 with the inline constant -2.0
;   * VI-DENORM:         v_fma_f16 with -2.0
;   * GFX10/GFX11 FLUSH: v_add_f16 (a + a) followed by v_sub_f16 (b - (a + a))
;   * GFX10/GFX11 DENORM: v_fmac_f16 with -2.0
; %in is not referenced by the body.
887 define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
888 ; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
889 ; VI-FLUSH:       ; %bb.0:
890 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
891 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
892 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
893 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
894 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
895 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
896 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
897 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
898 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
899 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
900 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
901 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
902 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, -2.0, v4
903 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
904 ; VI-FLUSH-NEXT:    s_endpgm
906 ; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
907 ; VI-DENORM:       ; %bb.0:
908 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
909 ; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
910 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
911 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
912 ; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
913 ; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
914 ; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
915 ; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
916 ; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
917 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
918 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
919 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
920 ; VI-DENORM-NEXT:    v_fma_f16 v2, v4, -2.0, v2
921 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
922 ; VI-DENORM-NEXT:    s_endpgm
924 ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
925 ; GFX10-FLUSH:       ; %bb.0:
926 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
927 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
928 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
929 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
930 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
931 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
932 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
933 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
934 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
935 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
936 ; GFX10-FLUSH-NEXT:    s_endpgm
938 ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
939 ; GFX10-DENORM:       ; %bb.0:
940 ; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
941 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
942 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
943 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
944 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
945 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
946 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
947 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
948 ; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
949 ; GFX10-DENORM-NEXT:    s_endpgm
951 ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
952 ; GFX11-FLUSH:       ; %bb.0:
953 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
954 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
955 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
956 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
957 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
958 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
959 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
960 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
961 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
962 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
963 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
964 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
965 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
966 ; GFX11-FLUSH-NEXT:    s_endpgm
968 ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
969 ; GFX11-DENORM:       ; %bb.0:
970 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
971 ; GFX11-DENORM-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
972 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
973 ; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
974 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
975 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
976 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
977 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
978 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
979 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
980 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
981 ; GFX11-DENORM-NEXT:    s_endpgm
982   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 -> %out[tid] (a), %gep.1 -> %out[tid + 1] (b); the result is
  ; written back through %gep.out, which is the same address as %gep.0.
983   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
984   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
985   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
987   %r1 = load volatile half, ptr addrspace(1) %gep.0
988   %r2 = load volatile half, ptr addrspace(1) %gep.1
  ; The constant -2.0 is the first multiplicand; a (%r1) is the second.
990   %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
991   store half %r3, ptr addrspace(1) %gep.out
992   ret void
; fmuladd_neg_2.0_neg_a_b_f16: computes llvm.fmuladd.f16(-2.0, -a, b), where a
; and b are volatile half loads from %out[tid] and %out[tid + 1], and stores
; the result to %out[tid].
; The assertions below expect the two negations to cancel, so every
; configuration uses a positive 2.0 and the un-negated a:
;   * VI-FLUSH:          v_mac_f16 with 2.0
;   * VI-DENORM:         v_fma_f16 with 2.0
;   * GFX10/GFX11 FLUSH: v_add_f16 (a + a) followed by v_add_f16 (b + (a + a))
;   * GFX10/GFX11 DENORM: v_fmac_f16 with 2.0
; %in is not referenced by the body.
995 define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
996 ; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
997 ; VI-FLUSH:       ; %bb.0:
998 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
999 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1000 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1001 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1002 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1003 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1004 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1005 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1006 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
1007 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1008 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1009 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1010 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
1011 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1012 ; VI-FLUSH-NEXT:    s_endpgm
1014 ; VI-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1015 ; VI-DENORM:       ; %bb.0:
1016 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1017 ; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1018 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1019 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
1020 ; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1021 ; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1022 ; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1023 ; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1024 ; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
1025 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
1026 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
1027 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
1028 ; VI-DENORM-NEXT:    v_fma_f16 v2, v4, 2.0, v2
1029 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
1030 ; VI-DENORM-NEXT:    s_endpgm
1032 ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1033 ; GFX10-FLUSH:       ; %bb.0:
1034 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1035 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1036 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1037 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
1038 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1039 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1040 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1041 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
1042 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
1043 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1044 ; GFX10-FLUSH-NEXT:    s_endpgm
1046 ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1047 ; GFX10-DENORM:       ; %bb.0:
1048 ; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1049 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1050 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1051 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
1052 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
1053 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1054 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
1055 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
1056 ; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
1057 ; GFX10-DENORM-NEXT:    s_endpgm
1059 ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1060 ; GFX11-FLUSH:       ; %bb.0:
1061 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1062 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1063 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1064 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1065 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1066 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
1067 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1068 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1069 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1070 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
1071 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1072 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
1073 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1074 ; GFX11-FLUSH-NEXT:    s_endpgm
1076 ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1077 ; GFX11-DENORM:       ; %bb.0:
1078 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1079 ; GFX11-DENORM-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1080 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1081 ; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1082 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1083 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
1084 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1085 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1086 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1087 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
1088 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
1089 ; GFX11-DENORM-NEXT:    s_endpgm
1090   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 -> %out[tid] (a), %gep.1 -> %out[tid + 1] (b); the result is
  ; written back through %gep.out, which is the same address as %gep.0.
1091   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
1092   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
1093   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
1095   %r1 = load volatile half, ptr addrspace(1) %gep.0
1096   %r2 = load volatile half, ptr addrspace(1) %gep.1
  ; Both multiplicands are negated: the constant is -2.0 and a goes through an
  ; explicit fneg before the call.
1098   %r1.fneg = fneg half %r1
1100   %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
1101   store half %r3, ptr addrspace(1) %gep.out
1102   ret void
; fmuladd_2.0_neg_a_b_f16: computes llvm.fmuladd.f16(2.0, -a, b), where a and b
; are volatile half loads from %out[tid] and %out[tid + 1], and stores the
; result to %out[tid].
; The assertions below expect the fneg on a to be absorbed into the constant
; (or into a subtract), so no separate negate instruction is checked for:
;   * VI-FLUSH:          v_mac_f16 with the inline constant -2.0
;   * VI-DENORM:         v_fma_f16 with -2.0
;   * GFX10/GFX11 FLUSH: v_add_f16 (a + a) followed by v_sub_f16 (b - (a + a))
;   * GFX10/GFX11 DENORM: v_fmac_f16 with -2.0
; %in is not referenced by the body.
1105 define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1106 ; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
1107 ; VI-FLUSH:       ; %bb.0:
1108 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1109 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1110 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1111 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1112 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1113 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1114 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1115 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1116 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
1117 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1118 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1119 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1120 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, -2.0, v4
1121 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1122 ; VI-FLUSH-NEXT:    s_endpgm
1124 ; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
1125 ; VI-DENORM:       ; %bb.0:
1126 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1127 ; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1128 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1129 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
1130 ; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1131 ; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1132 ; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1133 ; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1134 ; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
1135 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
1136 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
1137 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
1138 ; VI-DENORM-NEXT:    v_fma_f16 v2, v4, -2.0, v2
1139 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
1140 ; VI-DENORM-NEXT:    s_endpgm
1142 ; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
1143 ; GFX10-FLUSH:       ; %bb.0:
1144 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1145 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1146 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1147 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
1148 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1149 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1150 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1151 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
1152 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
1153 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1154 ; GFX10-FLUSH-NEXT:    s_endpgm
1156 ; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
1157 ; GFX10-DENORM:       ; %bb.0:
1158 ; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1159 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1160 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1161 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
1162 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
1163 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1164 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
1165 ; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
1166 ; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
1167 ; GFX10-DENORM-NEXT:    s_endpgm
1169 ; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
1170 ; GFX11-FLUSH:       ; %bb.0:
1171 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1172 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1173 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1174 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1175 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1176 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
1177 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1178 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1179 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1180 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
1181 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1182 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
1183 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1184 ; GFX11-FLUSH-NEXT:    s_endpgm
1186 ; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
1187 ; GFX11-DENORM:       ; %bb.0:
1188 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1189 ; GFX11-DENORM-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1190 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1191 ; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1192 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1193 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
1194 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1195 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1196 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1197 ; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
1198 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
1199 ; GFX11-DENORM-NEXT:    s_endpgm
1200   %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 -> %out[tid] (a), %gep.1 -> %out[tid + 1] (b); the result is
  ; written back through %gep.out, which is the same address as %gep.0.
1201   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
1202   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
1203   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
1205   %r1 = load volatile half, ptr addrspace(1) %gep.0
1206   %r2 = load volatile half, ptr addrspace(1) %gep.1
  ; Only a is negated here; the constant multiplicand stays +2.0 in the IR.
1208   %r1.fneg = fneg half %r1
1210   %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
1211   store half %r3, ptr addrspace(1) %gep.out
1212   ret void
1215 define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1216 ; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
1217 ; VI-FLUSH:       ; %bb.0:
1218 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1219 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1220 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1221 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1222 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1223 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1224 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1225 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1226 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
1227 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1228 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1229 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1230 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v4, 2.0, -v2
1231 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1232 ; VI-FLUSH-NEXT:    s_endpgm
1234 ; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
1235 ; VI-DENORM:       ; %bb.0:
1236 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1237 ; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1238 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1239 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
1240 ; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1241 ; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1242 ; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1243 ; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1244 ; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
1245 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
1246 ; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
1247 ; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
1248 ; VI-DENORM-NEXT:    v_fma_f16 v2, v4, 2.0, -v2
1249 ; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
1250 ; VI-DENORM-NEXT:    s_endpgm
1252 ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
1253 ; GFX10-FLUSH:       ; %bb.0:
1254 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1255 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1256 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1257 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
1258 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1259 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1260 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1261 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
1262 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
1263 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1264 ; GFX10-FLUSH-NEXT:    s_endpgm
1266 ; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
1267 ; GFX10-DENORM:       ; %bb.0:
1268 ; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1269 ; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1270 ; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1271 ; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
1272 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
1273 ; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1274 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
1275 ; GFX10-DENORM-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
1276 ; GFX10-DENORM-NEXT:    global_store_short v0, v1, s[0:1]
1277 ; GFX10-DENORM-NEXT:    s_endpgm
1279 ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
1280 ; GFX11-FLUSH:       ; %bb.0:
1281 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1282 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1283 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1284 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1285 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1286 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
1287 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1288 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1289 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1290 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
1291 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1292 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
1293 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1294 ; GFX11-FLUSH-NEXT:    s_endpgm
1296 ; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
1297 ; GFX11-DENORM:       ; %bb.0:
1298 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
1299 ; GFX11-DENORM-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1300 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1301 ; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1302 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
1303 ; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
1304 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1305 ; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1306 ; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
1307 ; GFX11-DENORM-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
1308 ; GFX11-DENORM-NEXT:    global_store_b16 v0, v1, s[0:1]
1309 ; GFX11-DENORM-NEXT:    s_endpgm
1310   %tid = call i32 @llvm.amdgcn.workitem.id.x()
1311   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
1312   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
1313   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
1315   %r1 = load volatile half, ptr addrspace(1) %gep.0
1316   %r2 = load volatile half, ptr addrspace(1) %gep.1
1318   %r2.fneg = fneg half %r2
1320   %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
1321   store half %r3, ptr addrspace(1) %gep.out
1322   ret void
; Kernel under test computes out[tid] = a * b - c, where a, b and c are
; volatile half loads from ptr[tid], ptr[tid + 1] and ptr[tid + 2]. The
; multiply and the subtract are separate fmul/fsub instructions without
; fast-math flags.
; Per the assertions below, the VI-FLUSH output uses v_mad_f16, the
; DENORM-CONTRACT outputs use v_fma_f16 (both with the addend negated), and the
; GFX10/GFX11 FLUSH and DENORM-STRICT outputs keep v_mul_f16 + v_sub_f16.
; NOTE(review): this function has no assertions under the plain VI-DENORM
; prefix, so the fiji ieee-denormal fp-contract=on RUN line looks unchecked
; here - confirm whether that RUN line should get its own strict prefix.
1325 define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
1326 ; VI-FLUSH-LABEL: mad_sub_f16:
1327 ; VI-FLUSH:       ; %bb.0:
1328 ; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1329 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1330 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1331 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
1332 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1333 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1334 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1335 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1336 ; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1337 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1338 ; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
1339 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1340 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1341 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1342 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
1343 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1344 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1345 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1346 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1347 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, v2, -v3
1348 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1349 ; VI-FLUSH-NEXT:    s_endpgm
1351 ; VI-DENORM-CONTRACT-LABEL: mad_sub_f16:
1352 ; VI-DENORM-CONTRACT:       ; %bb.0:
1353 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1354 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1355 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1356 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
1357 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1358 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1359 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1360 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1361 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1362 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1363 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
1364 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1365 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
1366 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1367 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
1368 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1369 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
1370 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1371 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1372 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, -v3
1373 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
1374 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
1376 ; GFX10-FLUSH-LABEL: mad_sub_f16:
1377 ; GFX10-FLUSH:       ; %bb.0:
1378 ; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1379 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1380 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1381 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1382 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1383 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1384 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1385 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1386 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1387 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1388 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
1389 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1390 ; GFX10-FLUSH-NEXT:    s_endpgm
1392 ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16:
1393 ; GFX10-DENORM-STRICT:       ; %bb.0:
1394 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1395 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1396 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1397 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1398 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1399 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1400 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1401 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1402 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1403 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1404 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
1405 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
1406 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
1408 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16:
1409 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
1410 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1411 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1412 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1413 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1414 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1415 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1416 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1417 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1418 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1419 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
1420 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
1421 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
1423 ; GFX11-FLUSH-LABEL: mad_sub_f16:
1424 ; GFX11-FLUSH:       ; %bb.0:
1425 ; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1426 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1427 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1428 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1429 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1430 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1431 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1432 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1433 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1434 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1435 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1436 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1437 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1438 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
1439 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1440 ; GFX11-FLUSH-NEXT:    s_endpgm
1442 ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16:
1443 ; GFX11-DENORM-STRICT:       ; %bb.0:
1444 ; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1445 ; GFX11-DENORM-STRICT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1446 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1447 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1448 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1449 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1450 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1451 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1452 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1453 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1454 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1455 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1456 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1457 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
1458 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
1459 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
1461 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16:
1462 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
1463 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1464 ; GFX11-DENORM-CONTRACT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1465 ; GFX11-DENORM-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1466 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1467 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1468 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1469 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1470 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1471 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1472 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1473 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1474 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
1475 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
1476 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
1477   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
1478   %tid.ext = sext i32 %tid to i64
1479   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
1480   %add1 = add i64 %tid.ext, 1
1481   %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
1482   %add2 = add i64 %tid.ext, 2
1483   %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
1484   %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
1485   %a = load volatile half, ptr addrspace(1) %gep0, align 2
1486   %b = load volatile half, ptr addrspace(1) %gep1, align 2
1487   %c = load volatile half, ptr addrspace(1) %gep2, align 2
  ; Separate fmul and fsub giving a * b - c; the assertions above record which
  ; configurations fuse this pair into a single mad/fma.
1488   %mul = fmul half %a, %b
1489   %sub = fsub half %mul, %c
1490   store half %sub, ptr addrspace(1) %outgep, align 2
1491   ret void
; Kernel under test computes out[tid] = c - a * b, where a, b and c are
; volatile half loads from ptr[tid], ptr[tid + 1] and ptr[tid + 2]. The
; multiply and the subtract are separate fmul/fsub instructions without
; fast-math flags.
; Per the assertions below, the VI-FLUSH output uses v_mad_f16, the
; DENORM-CONTRACT outputs use v_fma_f16 (both with the first multiplicand
; negated), and the GFX10/GFX11 FLUSH and DENORM-STRICT outputs keep
; v_mul_f16 + v_sub_f16.
; NOTE(review): this function has no assertions under the plain VI-DENORM
; prefix, so the fiji ieee-denormal fp-contract=on RUN line looks unchecked
; here - confirm whether that RUN line should get its own strict prefix.
1494 define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
1495 ; VI-FLUSH-LABEL: mad_sub_inv_f16:
1496 ; VI-FLUSH:       ; %bb.0:
1497 ; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1498 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1499 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1500 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
1501 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1502 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1503 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1504 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1505 ; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1506 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1507 ; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
1508 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1509 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1510 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1511 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
1512 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1513 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1514 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1515 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1516 ; VI-FLUSH-NEXT:    v_mad_f16 v2, -v7, v2, v3
1517 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1518 ; VI-FLUSH-NEXT:    s_endpgm
1520 ; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
1521 ; VI-DENORM-CONTRACT:       ; %bb.0:
1522 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1523 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1524 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1525 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
1526 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1527 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1528 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1529 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1530 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1531 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1532 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
1533 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1534 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
1535 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1536 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
1537 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1538 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
1539 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1540 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1541 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, -v7, v2, v3
1542 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
1543 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
1545 ; GFX10-FLUSH-LABEL: mad_sub_inv_f16:
1546 ; GFX10-FLUSH:       ; %bb.0:
1547 ; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1548 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1549 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1550 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1551 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1552 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1553 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1554 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1555 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1556 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1557 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
1558 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1559 ; GFX10-FLUSH-NEXT:    s_endpgm
1561 ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16:
1562 ; GFX10-DENORM-STRICT:       ; %bb.0:
1563 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1564 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1565 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1566 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1567 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1568 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1569 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1570 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1571 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1572 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1573 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
1574 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
1575 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
1577 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
1578 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
1579 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1580 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1581 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1582 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1583 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1584 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1585 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1586 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1587 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1588 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
1589 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
1590 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
1592 ; GFX11-FLUSH-LABEL: mad_sub_inv_f16:
1593 ; GFX11-FLUSH:       ; %bb.0:
1594 ; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1595 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1596 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1597 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1598 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1599 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1600 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1601 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1602 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1603 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1604 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1605 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1606 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1607 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
1608 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1609 ; GFX11-FLUSH-NEXT:    s_endpgm
1611 ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16:
1612 ; GFX11-DENORM-STRICT:       ; %bb.0:
1613 ; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1614 ; GFX11-DENORM-STRICT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1615 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1616 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1617 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1618 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1619 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1620 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1621 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1622 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1623 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1624 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1625 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1626 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
1627 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
1628 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
1630 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
1631 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
1632 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1633 ; GFX11-DENORM-CONTRACT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1634 ; GFX11-DENORM-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1635 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1636 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1637 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1638 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1639 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1640 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1641 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1642 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1643 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
1644 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
1645 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
1646   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
1647   %tid.ext = sext i32 %tid to i64
1648   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
1649   %add1 = add i64 %tid.ext, 1
1650   %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
1651   %add2 = add i64 %tid.ext, 2
1652   %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
1653   %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
1654   %a = load volatile half, ptr addrspace(1) %gep0, align 2
1655   %b = load volatile half, ptr addrspace(1) %gep1, align 2
1656   %c = load volatile half, ptr addrspace(1) %gep2, align 2
  ; Operands of the fsub are swapped relative to a plain multiply-subtract, so
  ; the result is c - a * b; fused forms negate a multiplicand instead of c.
1657   %mul = fmul half %a, %b
1658   %sub = fsub half %c, %mul
1659   store half %sub, ptr addrspace(1) %outgep, align 2
1660   ret void
; Kernel under test computes out[tid] = a * b - fabs(c), where a, b and c are
; volatile half loads from ptr[tid], ptr[tid + 1] and ptr[tid + 2]. The
; multiply and the subtract are separate fmul/fsub instructions without
; fast-math flags, and the absolute value comes from llvm.fabs.f16.
; Per the assertions below, the fabs is always folded into an operand
; modifier. The VI-FLUSH output uses v_mad_f16 and the DENORM-CONTRACT outputs
; use v_fma_f16, both with a -|v3| addend; the GFX10/GFX11 FLUSH and
; DENORM-STRICT outputs keep v_mul_f16 followed by v_sub_f16_e64 with |v3|.
; NOTE(review): this function has no assertions under the plain VI-DENORM
; prefix, so the fiji ieee-denormal fp-contract=on RUN line looks unchecked
; here - confirm whether that RUN line should get its own strict prefix.
1663 define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
1664 ; VI-FLUSH-LABEL: mad_sub_fabs_f16:
1665 ; VI-FLUSH:       ; %bb.0:
1666 ; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1667 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1668 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1669 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
1670 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1671 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1672 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1673 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1674 ; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1675 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1676 ; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
1677 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1678 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1679 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1680 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
1681 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1682 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1683 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1684 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1685 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, v2, -|v3|
1686 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1687 ; VI-FLUSH-NEXT:    s_endpgm
1689 ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
1690 ; VI-DENORM-CONTRACT:       ; %bb.0:
1691 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1692 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1693 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1694 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
1695 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1696 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1697 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1698 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1699 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1700 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1701 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
1702 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1703 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
1704 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1705 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
1706 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1707 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
1708 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1709 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1710 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, -|v3|
1711 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
1712 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
1714 ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16:
1715 ; GFX10-FLUSH:       ; %bb.0:
1716 ; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1717 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1718 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1719 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1720 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1721 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1722 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1723 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1724 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1725 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1726 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
1727 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1728 ; GFX10-FLUSH-NEXT:    s_endpgm
1730 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
1731 ; GFX10-DENORM-STRICT:       ; %bb.0:
1732 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1733 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1734 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1735 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1736 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1737 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1738 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1739 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1740 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1741 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1742 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
1743 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
1744 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
1746 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
1747 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
1748 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1749 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1750 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1751 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1752 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1753 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1754 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1755 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1756 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1757 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
1758 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
1759 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
1761 ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16:
1762 ; GFX11-FLUSH:       ; %bb.0:
1763 ; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1764 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1765 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1766 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1767 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1768 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1769 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1770 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1771 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1772 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1773 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1774 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1775 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1776 ; GFX11-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
1777 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1778 ; GFX11-FLUSH-NEXT:    s_endpgm
1780 ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
1781 ; GFX11-DENORM-STRICT:       ; %bb.0:
1782 ; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1783 ; GFX11-DENORM-STRICT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1784 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1785 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1786 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1787 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1788 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1789 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1790 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1791 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1792 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1793 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1794 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1795 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
1796 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
1797 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
1799 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
1800 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
1801 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1802 ; GFX11-DENORM-CONTRACT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1803 ; GFX11-DENORM-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1804 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1805 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1806 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1807 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1808 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1809 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1810 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1811 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1812 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
1813 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
1814 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
1815   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
1816   %tid.ext = sext i32 %tid to i64
1817   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
1818   %add1 = add i64 %tid.ext, 1
1819   %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
1820   %add2 = add i64 %tid.ext, 2
1821   %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
1822   %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
1823   %a = load volatile half, ptr addrspace(1) %gep0, align 2
1824   %b = load volatile half, ptr addrspace(1) %gep1, align 2
1825   %c = load volatile half, ptr addrspace(1) %gep2, align 2
  ; Only the subtrahend goes through fabs, so the result is a * b - |c|; the
  ; assertions above expect the abs to appear as a source modifier on v3.
1826   %c.abs = call half @llvm.fabs.f16(half %c) #0
1827   %mul = fmul half %a, %b
1828   %sub = fsub half %mul, %c.abs
1829   store half %sub, ptr addrspace(1) %outgep, align 2
1830   ret void
; Test kernel: out[tid] = fabs(c) - (a * b) on half values, where a, b and c
; are volatile loads from %ptr at element indices tid, tid+1 and tid+2
; (tid = workitem.id.x). The fmul and fsub are separate IR instructions, so
; whether they are fused depends on the -fp-contract / denormal mode of each
; RUN line.
;
; Instruction selection pinned by the check blocks below:
;   VI-FLUSH                          one v_mad_f16 with -a and |c| modifiers
;   VI-DENORM-CONTRACT                one v_fma_f16 with -a and |c| modifiers
;   GFX10/GFX11-DENORM-CONTRACT       one v_fma_f16 with -a and |c| modifiers
;   GFX10/GFX11-FLUSH, -DENORM-STRICT v_mul_f16 followed by v_sub_f16 (|c| - mul)
;
; NOTE(review): no check block uses the bare VI-DENORM prefix for this
; function, so the VI -fp-contract=on denormal run (whose only prefix is
; VI-DENORM) appears to be unchecked here -- confirm this is intended.
; NOTE(review): no closing `}` follows `ret void` in this listing and the
; embedded numbering skips ahead; presumably lost in extraction -- verify
; against the upstream file.
1833 define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
1834 ; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16:
1835 ; VI-FLUSH:       ; %bb.0:
1836 ; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1837 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1838 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1839 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
1840 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1841 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1842 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1843 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1844 ; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1845 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1846 ; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
1847 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1848 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
1849 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1850 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
1851 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1852 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
1853 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1854 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1855 ; VI-FLUSH-NEXT:    v_mad_f16 v2, -v7, v2, |v3|
1856 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
1857 ; VI-FLUSH-NEXT:    s_endpgm
1859 ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
1860 ; VI-DENORM-CONTRACT:       ; %bb.0:
1861 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1862 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
1863 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1864 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
1865 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
1866 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1867 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1868 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1869 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
1870 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1871 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
1872 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1873 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
1874 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1875 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
1876 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1877 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
1878 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
1879 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1880 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, -v7, v2, |v3|
1881 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
1882 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
1884 ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16:
1885 ; GFX10-FLUSH:       ; %bb.0:
1886 ; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1887 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1888 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1889 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1890 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1891 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1892 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1893 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1894 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1895 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1896 ; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
1897 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
1898 ; GFX10-FLUSH-NEXT:    s_endpgm
1900 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
1901 ; GFX10-DENORM-STRICT:       ; %bb.0:
1902 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1903 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1904 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1905 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1906 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1907 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1908 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1909 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1910 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1911 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1912 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
1913 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
1914 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
1916 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
1917 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
1918 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1919 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1920 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1921 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
1922 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1923 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
1924 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1925 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
1926 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1927 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
1928 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
1929 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
1931 ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16:
1932 ; GFX11-FLUSH:       ; %bb.0:
1933 ; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1934 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1935 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1936 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1937 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
1938 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1939 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1940 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1941 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1942 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1943 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
1944 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
1945 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1946 ; GFX11-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
1947 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
1948 ; GFX11-FLUSH-NEXT:    s_endpgm
1950 ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
1951 ; GFX11-DENORM-STRICT:       ; %bb.0:
1952 ; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1953 ; GFX11-DENORM-STRICT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1954 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1955 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1956 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
1957 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1958 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1959 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1960 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1961 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1962 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
1963 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
1964 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1965 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
1966 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
1967 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
1969 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
1970 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
1971 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1972 ; GFX11-DENORM-CONTRACT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1973 ; GFX11-DENORM-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1974 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1975 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
1976 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
1977 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1978 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1979 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1980 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1981 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
1982 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
1983 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
1984 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
  ; IR under test: %sub = fabs(%c) - (%a * %b); note the fsub operand order
  ; (%c.abs first), which is what the -a / |c| modifiers above correspond to.
1985   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
1986   %tid.ext = sext i32 %tid to i64
1987   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
1988   %add1 = add i64 %tid.ext, 1
1989   %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
1990   %add2 = add i64 %tid.ext, 2
1991   %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
1992   %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
1993   %a = load volatile half, ptr addrspace(1) %gep0, align 2
1994   %b = load volatile half, ptr addrspace(1) %gep1, align 2
1995   %c = load volatile half, ptr addrspace(1) %gep2, align 2
1996   %c.abs = call half @llvm.fabs.f16(half %c) #0
1997   %mul = fmul half %a, %b
1998   %sub = fsub half %c.abs, %mul
1999   store half %sub, ptr addrspace(1) %outgep, align 2
2000   ret void
; Test kernel: out[tid] = (-a) * (-b) + c on half values, where a, b and c
; are volatile loads from %ptr at element indices tid, tid+1 and tid+2
; (tid = workitem.id.x). Both multiplicands are negated with fneg; none of
; the expected instructions below carries a neg source modifier, i.e. the
; two negations are expected to cancel.
;
; Instruction selection pinned by the check blocks below:
;   VI-FLUSH                          one v_mac_f16 accumulating into c
;   VI-DENORM-CONTRACT                one v_fma_f16 (a, b, c)
;   GFX10/GFX11-DENORM-CONTRACT       one v_fmac_f16 accumulating into c
;   GFX10/GFX11-FLUSH, -DENORM-STRICT v_mul_f16 followed by v_add_f16
;
; NOTE(review): no check block uses the bare VI-DENORM prefix for this
; function, so the VI -fp-contract=on denormal run (whose only prefix is
; VI-DENORM) appears to be unchecked here -- confirm this is intended.
; NOTE(review): no closing `}` follows `ret void` in this listing and the
; embedded numbering skips ahead; presumably lost in extraction -- verify
; against the upstream file.
2003 define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
2004 ; VI-FLUSH-LABEL: neg_neg_mad_f16:
2005 ; VI-FLUSH:       ; %bb.0:
2006 ; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2007 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
2008 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2009 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
2010 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
2011 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2012 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2013 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2014 ; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
2015 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2016 ; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
2017 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2018 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
2019 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2020 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
2021 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2022 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
2023 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
2024 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2025 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v7, v2
2026 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
2027 ; VI-FLUSH-NEXT:    s_endpgm
2029 ; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
2030 ; VI-DENORM-CONTRACT:       ; %bb.0:
2031 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2032 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
2033 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2034 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
2035 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
2036 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2037 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2038 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2039 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
2040 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2041 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
2042 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2043 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
2044 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2045 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
2046 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2047 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
2048 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
2049 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2050 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, v3
2051 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
2052 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
2054 ; GFX10-FLUSH-LABEL: neg_neg_mad_f16:
2055 ; GFX10-FLUSH:       ; %bb.0:
2056 ; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2057 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2058 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2059 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
2060 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2061 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
2062 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2063 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
2064 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2065 ; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
2066 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
2067 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
2068 ; GFX10-FLUSH-NEXT:    s_endpgm
2070 ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16:
2071 ; GFX10-DENORM-STRICT:       ; %bb.0:
2072 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2073 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2074 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2075 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
2076 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2077 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
2078 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2079 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
2080 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2081 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
2082 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
2083 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
2084 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
2086 ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
2087 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
2088 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2089 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2090 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2091 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
2092 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2093 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
2094 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2095 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
2096 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2097 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
2098 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[0:1]
2099 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
2101 ; GFX11-FLUSH-LABEL: neg_neg_mad_f16:
2102 ; GFX11-FLUSH:       ; %bb.0:
2103 ; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2104 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2105 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2106 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2107 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2108 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
2109 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2110 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2111 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2112 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2113 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2114 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
2115 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2116 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
2117 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
2118 ; GFX11-FLUSH-NEXT:    s_endpgm
2120 ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16:
2121 ; GFX11-DENORM-STRICT:       ; %bb.0:
2122 ; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2123 ; GFX11-DENORM-STRICT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2124 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2125 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2126 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2127 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
2128 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2129 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2130 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2131 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2132 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2133 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
2134 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2135 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
2136 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
2137 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
2139 ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
2140 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
2141 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2142 ; GFX11-DENORM-CONTRACT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2143 ; GFX11-DENORM-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2144 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2145 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2146 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
2147 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2148 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2149 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2150 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2151 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2152 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
2153 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v3, s[0:1]
2154 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
  ; IR under test: (-%a) * (-%b) + %c. The final value is named %sub but is
  ; produced by an fadd, not an fsub.
2155   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
2156   %tid.ext = sext i32 %tid to i64
2157   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
2158   %add1 = add i64 %tid.ext, 1
2159   %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
2160   %add2 = add i64 %tid.ext, 2
2161   %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
2162   %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
2163   %a = load volatile half, ptr addrspace(1) %gep0, align 2
2164   %b = load volatile half, ptr addrspace(1) %gep1, align 2
2165   %c = load volatile half, ptr addrspace(1) %gep2, align 2
2166   %nega = fneg half %a
2167   %negb = fneg half %b
2168   %mul = fmul half %nega, %negb
2169   %sub = fadd half %mul, %c
2170   store half %sub, ptr addrspace(1) %outgep, align 2
2171   ret void
; Test kernel: out[tid] = a * fabs(b) - c on half values, where a, b and c
; are volatile loads from %ptr at element indices tid, tid+1 and tid+2
; (tid = workitem.id.x). Here the fabs is applied to a multiplicand rather
; than to the subtracted operand.
;
; Instruction selection pinned by the check blocks below:
;   VI-FLUSH                          one v_mad_f16 with |b| and -c modifiers
;   VI-DENORM-CONTRACT                one v_fma_f16 with |b| and -c modifiers
;   GFX10/GFX11-DENORM-CONTRACT       one v_fma_f16 with |b| and -c modifiers
;   GFX10/GFX11-FLUSH, -DENORM-STRICT v_mul_f16 (with |b|) followed by v_sub_f16
;
; NOTE(review): no check block uses the bare VI-DENORM prefix for this
; function, so the VI -fp-contract=on denormal run (whose only prefix is
; VI-DENORM) appears to be unchecked here -- confirm this is intended.
; NOTE(review): no closing `}` follows `ret void` in this listing and the
; embedded numbering skips ahead; presumably lost in extraction -- verify
; against the upstream file.
2174 define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
2175 ; VI-FLUSH-LABEL: mad_fabs_sub_f16:
2176 ; VI-FLUSH:       ; %bb.0:
2177 ; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2178 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
2179 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2180 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
2181 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
2182 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2183 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2184 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2185 ; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
2186 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2187 ; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
2188 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2189 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
2190 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2191 ; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
2192 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2193 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
2194 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
2195 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2196 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, |v2|, -v3
2197 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
2198 ; VI-FLUSH-NEXT:    s_endpgm
2200 ; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
2201 ; VI-DENORM-CONTRACT:       ; %bb.0:
2202 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2203 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
2204 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2205 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
2206 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
2207 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2208 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2209 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2210 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
2211 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2212 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
2213 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2214 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
2215 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2216 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
2217 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2218 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
2219 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
2220 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2221 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, |v2|, -v3
2222 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
2223 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
2225 ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16:
2226 ; GFX10-FLUSH:       ; %bb.0:
2227 ; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2228 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2229 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2230 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
2231 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2232 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
2233 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2234 ; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
2235 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2236 ; GFX10-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
2237 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
2238 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
2239 ; GFX10-FLUSH-NEXT:    s_endpgm
2241 ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
2242 ; GFX10-DENORM-STRICT:       ; %bb.0:
2243 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2244 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2245 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2246 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
2247 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2248 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
2249 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2250 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
2251 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2252 ; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
2253 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
2254 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
2255 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
2257 ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
2258 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
2259 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2260 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2261 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2262 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
2263 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2264 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
2265 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2266 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
2267 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2268 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
2269 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
2270 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
2272 ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16:
2273 ; GFX11-FLUSH:       ; %bb.0:
2274 ; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2275 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2276 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2277 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2278 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2279 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
2280 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2281 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2282 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2283 ; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2284 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2285 ; GFX11-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
2286 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2287 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
2288 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
2289 ; GFX11-FLUSH-NEXT:    s_endpgm
2291 ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
2292 ; GFX11-DENORM-STRICT:       ; %bb.0:
2293 ; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2294 ; GFX11-DENORM-STRICT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2295 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2296 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2297 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2298 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
2299 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2300 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2301 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2302 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2303 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2304 ; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
2305 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2306 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
2307 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
2308 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
2310 ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
2311 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
2312 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2313 ; GFX11-DENORM-CONTRACT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2314 ; GFX11-DENORM-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2315 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2316 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2317 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
2318 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2319 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2320 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2321 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2322 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2323 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
2324 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
2325 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
  ; IR under test: %sub = (%a * fabs(%b)) - %c.
2326   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
2327   %tid.ext = sext i32 %tid to i64
2328   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
2329   %add1 = add i64 %tid.ext, 1
2330   %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
2331   %add2 = add i64 %tid.ext, 2
2332   %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
2333   %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
2334   %a = load volatile half, ptr addrspace(1) %gep0, align 2
2335   %b = load volatile half, ptr addrspace(1) %gep1, align 2
2336   %c = load volatile half, ptr addrspace(1) %gep2, align 2
2337   %b.abs = call half @llvm.fabs.f16(half %b) #0
2338   %mul = fmul half %a, %b.abs
2339   %sub = fsub half %mul, %c
2340   store half %sub, ptr addrspace(1) %outgep, align 2
2341   ret void
2344 define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
2345 ; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
2346 ; VI-FLUSH:       ; %bb.0:
2347 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2348 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2349 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2350 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
2351 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2352 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2353 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2354 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2355 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
2356 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2357 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
2358 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2359 ; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, -2.0, v4
2360 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
2361 ; VI-FLUSH-NEXT:    s_endpgm
2363 ; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
2364 ; VI-DENORM-CONTRACT:       ; %bb.0:
2365 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2366 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2367 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2368 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
2369 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2370 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2371 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2372 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2373 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v4, v[0:1] glc
2374 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2375 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
2376 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2377 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v4, -2.0, v2
2378 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
2379 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
2381 ; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
2382 ; GFX10-FLUSH:       ; %bb.0:
2383 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2384 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2385 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2386 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
2387 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2388 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2389 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2390 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
2391 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
2392 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
2393 ; GFX10-FLUSH-NEXT:    s_endpgm
2395 ; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
2396 ; GFX10-DENORM-STRICT:       ; %bb.0:
2397 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2398 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2399 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2400 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
2401 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2402 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2403 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2404 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
2405 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v2, v1
2406 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
2407 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
2409 ; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
2410 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
2411 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2412 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2413 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2414 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
2415 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2416 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2417 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2418 ; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
2419 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v2, s[0:1]
2420 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
2422 ; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
2423 ; GFX11-FLUSH:       ; %bb.0:
2424 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2425 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2426 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2427 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2428 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2429 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
2430 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2431 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2432 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2433 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
2434 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2435 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
2436 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
2437 ; GFX11-FLUSH-NEXT:    s_endpgm
2439 ; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
2440 ; GFX11-DENORM-STRICT:       ; %bb.0:
2441 ; GFX11-DENORM-STRICT-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2442 ; GFX11-DENORM-STRICT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2443 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2444 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2445 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2446 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
2447 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2448 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2449 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2450 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
2451 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2452 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v2, v1
2453 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
2454 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
2456 ; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
2457 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
2458 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2459 ; GFX11-DENORM-CONTRACT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2460 ; GFX11-DENORM-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2461 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2462 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2463 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
2464 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2465 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2466 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2467 ; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
2468 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v2, s[0:1]
2469 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
2470   %tid = call i32 @llvm.amdgcn.workitem.id.x()
2471   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
2472   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
2473   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
2475   %r1 = load volatile half, ptr addrspace(1) %gep.0
2476   %r2 = load volatile half, ptr addrspace(1) %gep.1
2478   %add = fadd half %r1, %r1
2479   %r3 = fsub half %r2, %add
2481   store half %r3, ptr addrspace(1) %gep.out
2482   ret void
; fsub_fadd_a_a_c_f16: computes (a + a) - c on half values, where a is the
; volatile load from %out[tid] and c is the volatile load from %out[tid + 1];
; the result is stored back to %out[tid]. %in is not used by the body.
;
; What the autogenerated checks below pin down for the fadd/fsub pair:
;   * VI-FLUSH expects a single v_mad_f16 (a * 2.0 - c).
;   * The three *-DENORM-CONTRACT blocks expect a single v_fma_f16
;     (a * 2.0 - c).
;   * GFX10-FLUSH, GFX11-FLUSH and the two *-DENORM-STRICT blocks expect a
;     separate v_add_f16 followed by v_sub_f16.
; NOTE(review): there is no check block for the plain VI-DENORM prefix on this
; function; presumably the two fiji ieee-denormal runs disagree here and the
; -fp-contract=on run has no prefix of its own, so that run is effectively
; unchecked -- confirm when regenerating with update_llc_test_checks.py.
2485 define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
2486 ; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
2487 ; VI-FLUSH:       ; %bb.0:
2488 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2489 ; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2490 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2491 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
2492 ; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2493 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2494 ; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2495 ; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2496 ; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
2497 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2498 ; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
2499 ; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2500 ; VI-FLUSH-NEXT:    v_mad_f16 v2, v4, 2.0, -v2
2501 ; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
2502 ; VI-FLUSH-NEXT:    s_endpgm
2504 ; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
2505 ; VI-DENORM-CONTRACT:       ; %bb.0:
2506 ; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2507 ; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2508 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2509 ; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
2510 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2511 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2512 ; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2513 ; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2514 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v4, v[0:1] glc
2515 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2516 ; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
2517 ; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2518 ; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v4, 2.0, -v2
2519 ; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
2520 ; VI-DENORM-CONTRACT-NEXT:    s_endpgm
2522 ; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
2523 ; GFX10-FLUSH:       ; %bb.0:
2524 ; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2525 ; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2526 ; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2527 ; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
2528 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2529 ; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2530 ; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2531 ; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
2532 ; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
2533 ; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
2534 ; GFX10-FLUSH-NEXT:    s_endpgm
2536 ; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
2537 ; GFX10-DENORM-STRICT:       ; %bb.0:
2538 ; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2539 ; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2540 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2541 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
2542 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2543 ; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2544 ; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2545 ; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
2546 ; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v2
2547 ; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
2548 ; GFX10-DENORM-STRICT-NEXT:    s_endpgm
2550 ; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
2551 ; GFX10-DENORM-CONTRACT:       ; %bb.0:
2552 ; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2553 ; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2554 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2555 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
2556 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2557 ; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2558 ; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2559 ; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
2560 ; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
2561 ; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
2563 ; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
2564 ; GFX11-FLUSH:       ; %bb.0:
2565 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2566 ; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2567 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2568 ; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2569 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
2570 ; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
2571 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2572 ; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2573 ; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
2574 ; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
2575 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2576 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
2577 ; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
2578 ; GFX11-FLUSH-NEXT:    s_endpgm
2580 ; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
2581 ; GFX11-DENORM-STRICT:       ; %bb.0:
2582 ; GFX11-DENORM-STRICT-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2583 ; GFX11-DENORM-STRICT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2584 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2585 ; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2586 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
2587 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
2588 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2589 ; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2590 ; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
2591 ; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
2592 ; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2593 ; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v2
2594 ; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
2595 ; GFX11-DENORM-STRICT-NEXT:    s_endpgm
2597 ; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
2598 ; GFX11-DENORM-CONTRACT:       ; %bb.0:
2599 ; GFX11-DENORM-CONTRACT-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
2600 ; GFX11-DENORM-CONTRACT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2601 ; GFX11-DENORM-CONTRACT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2602 ; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
2603 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
2604 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
2605 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2606 ; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2607 ; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
2608 ; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
2609 ; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
2610 ; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
; Addressing: %gep.0 is %out[tid], %gep.1 is the next half element, and
; %gep.out is computed exactly like %gep.0, so the store overwrites the first
; loaded element.
2611   %tid = call i32 @llvm.amdgcn.workitem.id.x()
2612   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
2613   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
2614   %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
; Volatile loads: %r1 is a (from %gep.0), %r2 is c (from %gep.1).
2616   %r1 = load volatile half, ptr addrspace(1) %gep.0
2617   %r2 = load volatile half, ptr addrspace(1) %gep.1
; Pattern under test: (a + a) - c. The sum is the minuend here, i.e. the
; operand order is the reverse of the fsub in fsub_c_fadd_a_a_f16.
2619   %add = fadd half %r1, %r1
2620   %r3 = fsub half %add, %r2
2622   store half %r3, ptr addrspace(1) %gep.out
2623   ret void
; Attribute groups #0 (nounwind) and #1 (nounwind readnone).
; NOTE(review): no function or call site in the adjacent test bodies names
; either group -- confirm they are still referenced elsewhere in the file
; before changing or removing them.
2626 attributes #0 = { nounwind }
2627 attributes #1 = { nounwind readnone }