1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-FLUSH %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-FLUSH %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-DENORM %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-DENORM,VI-DENORM-CONTRACT %s
8 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-FLUSH %s
9 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-FLUSH %s
10 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-STRICT %s
11 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-CONTRACT %s
13 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH %s
14 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH %s
15 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM,GFX11-DENORM-STRICT %s
16 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM,GFX11-DENORM-CONTRACT %s
; Intrinsic declarations referenced by this test: the work-item id query,
; the half-precision fmuladd, and the half-precision fabs.
18 declare i32 @llvm.amdgcn.workitem.id.x() #1
19 declare half @llvm.fmuladd.f16(half, half, half) #1
20 declare half @llvm.fabs.f16(half) #1
; Kernel fmuladd_f16 loads one half from each of %in1, %in2 and %in3, combines
; them with a single llvm.fmuladd.f16 call and stores the result to %out.
; Instruction selection expected by the checks below:
;   fiji, flush runs (preserve-sign)     -> v_mac_f16
;   fiji, denorm runs (ieee)             -> v_fma_f16
;   gfx1010 / gfx1100, flush runs        -> separate v_mul_f16 + v_add_f16
;   gfx1010 / gfx1100, denorm runs       -> v_fmac_f16
; The checks use only the prefixes shared by both -fp-contract settings, so
; the same output is expected for -fp-contract=on and -fp-contract=fast.
22 define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
23 ; VI-FLUSH-LABEL: fmuladd_f16:
25 ; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
26 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
27 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
28 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
29 ; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4
30 ; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5
31 ; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6
32 ; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7
33 ; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1]
34 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3]
35 ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5]
36 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
37 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
38 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
39 ; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2
40 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
41 ; VI-FLUSH-NEXT: s_endpgm
43 ; VI-DENORM-LABEL: fmuladd_f16:
45 ; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
46 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
47 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
48 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
49 ; VI-DENORM-NEXT: v_mov_b32_e32 v2, s4
50 ; VI-DENORM-NEXT: v_mov_b32_e32 v3, s5
51 ; VI-DENORM-NEXT: v_mov_b32_e32 v4, s6
52 ; VI-DENORM-NEXT: v_mov_b32_e32 v5, s7
53 ; VI-DENORM-NEXT: flat_load_ushort v6, v[0:1]
54 ; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3]
55 ; VI-DENORM-NEXT: flat_load_ushort v3, v[4:5]
56 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
57 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
58 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
59 ; VI-DENORM-NEXT: v_fma_f16 v2, v6, v2, v3
60 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
61 ; VI-DENORM-NEXT: s_endpgm
63 ; GFX10-FLUSH-LABEL: fmuladd_f16:
64 ; GFX10-FLUSH: ; %bb.0:
65 ; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
66 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
67 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
68 ; GFX10-FLUSH-NEXT: s_clause 0x2
69 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
70 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
71 ; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
72 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
73 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
74 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
75 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
76 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
77 ; GFX10-FLUSH-NEXT: s_endpgm
79 ; GFX10-DENORM-LABEL: fmuladd_f16:
80 ; GFX10-DENORM: ; %bb.0:
81 ; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
82 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
83 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
84 ; GFX10-DENORM-NEXT: s_clause 0x2
85 ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7]
86 ; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9]
87 ; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11]
88 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
89 ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
90 ; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5]
91 ; GFX10-DENORM-NEXT: s_endpgm
93 ; GFX11-FLUSH-LABEL: fmuladd_f16:
94 ; GFX11-FLUSH: ; %bb.0:
95 ; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
96 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
97 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
98 ; GFX11-FLUSH-NEXT: s_clause 0x2
99 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
100 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
101 ; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
102 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
103 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
104 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
105 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
106 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
107 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
108 ; GFX11-FLUSH-NEXT: s_nop 0
109 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
110 ; GFX11-FLUSH-NEXT: s_endpgm
112 ; GFX11-DENORM-LABEL: fmuladd_f16:
113 ; GFX11-DENORM: ; %bb.0:
114 ; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
115 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
116 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
117 ; GFX11-DENORM-NEXT: s_clause 0x2
118 ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3]
119 ; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[4:5]
120 ; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[6:7]
121 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
122 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
123 ; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1]
124 ; GFX11-DENORM-NEXT: s_nop 0
125 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
126 ; GFX11-DENORM-NEXT: s_endpgm
127 ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
128 %r0 = load half, ptr addrspace(1) %in1
129 %r1 = load half, ptr addrspace(1) %in2
130 %r2 = load half, ptr addrspace(1) %in3
131 %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
132 store half %r3, ptr addrspace(1) %out
; Kernel fmul_fadd_f16 computes the same a * b + c shape as the fmuladd test,
; but as a separate fmul followed by an fadd with no fast-math flags, so
; whether the pair is fused depends on the -fp-contract option of each run.
; Instruction selection expected by the checks below:
;   fiji, flush runs                          -> v_mac_f16 (both contract modes)
;   fiji, denorm run with -fp-contract=fast   -> v_fma_f16
;   gfx1010 / gfx1100, flush runs             -> v_mul_f16 + v_add_f16
;   gfx1010 / gfx1100, denorm, contract=on    -> v_mul_f16 + v_add_f16
;   gfx1010 / gfx1100, denorm, contract=fast  -> v_fmac_f16
; NOTE(review): the fiji ieee run with -fp-contract=on is given only the
; VI-DENORM prefix, and this kernel has no checks under that prefix, so that
; configuration appears to be unchecked here. Consider adding a dedicated
; strict prefix to that RUN line and regenerating the assertions.
136 define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
137 ; VI-FLUSH-LABEL: fmul_fadd_f16:
139 ; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
140 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
141 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
142 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
143 ; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4
144 ; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5
145 ; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6
146 ; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7
147 ; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1]
148 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3]
149 ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5]
150 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
151 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
152 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
153 ; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2
154 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
155 ; VI-FLUSH-NEXT: s_endpgm
157 ; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
158 ; VI-DENORM-CONTRACT: ; %bb.0:
159 ; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
160 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
161 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s2
162 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
163 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v2, s4
164 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v3, s5
165 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v4, s6
166 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v5, s7
167 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v6, v[0:1]
168 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3]
169 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5]
170 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s0
171 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
172 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
173 ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v6, v2, v3
174 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
175 ; VI-DENORM-CONTRACT-NEXT: s_endpgm
177 ; GFX10-FLUSH-LABEL: fmul_fadd_f16:
178 ; GFX10-FLUSH: ; %bb.0:
179 ; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
180 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
181 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
182 ; GFX10-FLUSH-NEXT: s_clause 0x2
183 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
184 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
185 ; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
186 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
187 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
188 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
189 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
190 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
191 ; GFX10-FLUSH-NEXT: s_endpgm
193 ; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16:
194 ; GFX10-DENORM-STRICT: ; %bb.0:
195 ; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
196 ; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0
197 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
198 ; GFX10-DENORM-STRICT-NEXT: s_clause 0x2
199 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7]
200 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[8:9]
201 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[10:11]
202 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1)
203 ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
204 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
205 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3
206 ; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5]
207 ; GFX10-DENORM-STRICT-NEXT: s_endpgm
209 ; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
210 ; GFX10-DENORM-CONTRACT: ; %bb.0:
211 ; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
212 ; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0
213 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2
215 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7]
216 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[8:9]
217 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[10:11]
218 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
219 ; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
220 ; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5]
221 ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
223 ; GFX11-FLUSH-LABEL: fmul_fadd_f16:
224 ; GFX11-FLUSH: ; %bb.0:
225 ; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
226 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
227 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
228 ; GFX11-FLUSH-NEXT: s_clause 0x2
229 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
230 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
231 ; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
232 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
233 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
234 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
235 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
236 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
237 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
238 ; GFX11-FLUSH-NEXT: s_nop 0
239 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
240 ; GFX11-FLUSH-NEXT: s_endpgm
242 ; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16:
243 ; GFX11-DENORM-STRICT: ; %bb.0:
244 ; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
245 ; GFX11-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0
246 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
247 ; GFX11-DENORM-STRICT-NEXT: s_clause 0x2
248 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3]
249 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[4:5]
250 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[6:7]
251 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1)
252 ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
253 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
254 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
255 ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3
256 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
257 ; GFX11-DENORM-STRICT-NEXT: s_nop 0
258 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
259 ; GFX11-DENORM-STRICT-NEXT: s_endpgm
261 ; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
262 ; GFX11-DENORM-CONTRACT: ; %bb.0:
263 ; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
264 ; GFX11-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0
265 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
266 ; GFX11-DENORM-CONTRACT-NEXT: s_clause 0x2
267 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3]
268 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[4:5]
269 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[6:7]
270 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
271 ; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
272 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[0:1]
273 ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
274 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
275 ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
276 ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
277 %r0 = load half, ptr addrspace(1) %in1
278 %r1 = load half, ptr addrspace(1) %in2
279 %r2 = load half, ptr addrspace(1) %in3
280 %mul = fmul half %r0, %r1
281 %add = fadd half %mul, %r2
282 store half %add, ptr addrspace(1) %out
; Kernel fmul_fadd_contract_f16 is the separate fmul + fadd form again, but
; both instructions carry the `contract` fast-math flag in the IR itself.
; The checks use only the prefixes shared by both -fp-contract settings, so
; the same output is expected whichever -fp-contract option a run passes.
; Instruction selection expected by the checks below:
;   fiji, flush runs                  -> v_mac_f16
;   fiji, denorm runs                 -> v_fma_f16
;   gfx1010 / gfx1100, flush runs     -> v_mul_f16 + v_add_f16
;   gfx1010 / gfx1100, denorm runs    -> v_fmac_f16
286 define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
287 ; VI-FLUSH-LABEL: fmul_fadd_contract_f16:
289 ; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
290 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
291 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
292 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
293 ; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4
294 ; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5
295 ; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6
296 ; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7
297 ; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1]
298 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3]
299 ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5]
300 ; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
301 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
302 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
303 ; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2
304 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
305 ; VI-FLUSH-NEXT: s_endpgm
307 ; VI-DENORM-LABEL: fmul_fadd_contract_f16:
308 ; VI-DENORM: ; %bb.0:
309 ; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
310 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
311 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
312 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
313 ; VI-DENORM-NEXT: v_mov_b32_e32 v2, s4
314 ; VI-DENORM-NEXT: v_mov_b32_e32 v3, s5
315 ; VI-DENORM-NEXT: v_mov_b32_e32 v4, s6
316 ; VI-DENORM-NEXT: v_mov_b32_e32 v5, s7
317 ; VI-DENORM-NEXT: flat_load_ushort v6, v[0:1]
318 ; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3]
319 ; VI-DENORM-NEXT: flat_load_ushort v3, v[4:5]
320 ; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
321 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
322 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
323 ; VI-DENORM-NEXT: v_fma_f16 v2, v6, v2, v3
324 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
325 ; VI-DENORM-NEXT: s_endpgm
327 ; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16:
328 ; GFX10-FLUSH: ; %bb.0:
329 ; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
330 ; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
331 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
332 ; GFX10-FLUSH-NEXT: s_clause 0x2
333 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
334 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
335 ; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
336 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
337 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
338 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
339 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
340 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
341 ; GFX10-FLUSH-NEXT: s_endpgm
343 ; GFX10-DENORM-LABEL: fmul_fadd_contract_f16:
344 ; GFX10-DENORM: ; %bb.0:
345 ; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
346 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
347 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
348 ; GFX10-DENORM-NEXT: s_clause 0x2
349 ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7]
350 ; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9]
351 ; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11]
352 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
353 ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
354 ; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5]
355 ; GFX10-DENORM-NEXT: s_endpgm
357 ; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16:
358 ; GFX11-FLUSH: ; %bb.0:
359 ; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
360 ; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
361 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
362 ; GFX11-FLUSH-NEXT: s_clause 0x2
363 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
364 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
365 ; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
366 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
367 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
368 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
369 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
370 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
371 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
372 ; GFX11-FLUSH-NEXT: s_nop 0
373 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
374 ; GFX11-FLUSH-NEXT: s_endpgm
376 ; GFX11-DENORM-LABEL: fmul_fadd_contract_f16:
377 ; GFX11-DENORM: ; %bb.0:
378 ; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
379 ; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
380 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
381 ; GFX11-DENORM-NEXT: s_clause 0x2
382 ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3]
383 ; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[4:5]
384 ; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[6:7]
385 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
386 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
387 ; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1]
388 ; GFX11-DENORM-NEXT: s_nop 0
389 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
390 ; GFX11-DENORM-NEXT: s_endpgm
391 ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
392 %r0 = load half, ptr addrspace(1) %in1
393 %r1 = load half, ptr addrspace(1) %in2
394 %r2 = load half, ptr addrspace(1) %in3
395 %mul = fmul contract half %r0, %r1
396 %add = fadd contract half %mul, %r2
397 store half %add, ptr addrspace(1) %out
; Kernel fmuladd_2.0_a_b_f16 calls llvm.fmuladd.f16 with the constant 2.0 as
; the FIRST multiplicand: fmuladd(2.0, a, b).
; a and b are volatile loads from %out[tid] and %out[tid + 1], and the result
; is stored back to %out[tid]. The %in argument is not referenced by the body.
; Instruction selection expected by the checks below:
;   fiji, flush runs                  -> v_mac_f16 with an inline 2.0 operand
;   fiji, denorm runs                 -> v_fma_f16 with an inline 2.0 operand
;   gfx1010 / gfx1100, flush runs     -> v_add_f16 a, a followed by v_add_f16
;   gfx1010 / gfx1100, denorm runs    -> v_fmac_f16 with an inline 2.0 operand
401 define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
402 ; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
404 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
405 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
406 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
407 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
408 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
409 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
410 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
411 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
412 ; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc
413 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
414 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
415 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
416 ; VI-FLUSH-NEXT: v_mac_f16_e32 v2, 2.0, v4
417 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
418 ; VI-FLUSH-NEXT: s_endpgm
420 ; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16:
421 ; VI-DENORM: ; %bb.0:
422 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
423 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
424 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
425 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
426 ; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
427 ; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
428 ; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
429 ; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
430 ; VI-DENORM-NEXT: flat_load_ushort v4, v[0:1] glc
431 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
432 ; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] glc
433 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
434 ; VI-DENORM-NEXT: v_fma_f16 v2, v4, 2.0, v2
435 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
436 ; VI-DENORM-NEXT: s_endpgm
438 ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
439 ; GFX10-FLUSH: ; %bb.0:
440 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
441 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
442 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
443 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
444 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
445 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
446 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
447 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
448 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
449 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
450 ; GFX10-FLUSH-NEXT: s_endpgm
452 ; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16:
453 ; GFX10-DENORM: ; %bb.0:
454 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
455 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
456 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
457 ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
458 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
459 ; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
460 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
461 ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
462 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
463 ; GFX10-DENORM-NEXT: s_endpgm
465 ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
466 ; GFX11-FLUSH: ; %bb.0:
467 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
468 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
469 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
470 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
471 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
472 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
473 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
474 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
475 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
476 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
477 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
478 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
479 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
480 ; GFX11-FLUSH-NEXT: s_nop 0
481 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
482 ; GFX11-FLUSH-NEXT: s_endpgm
484 ; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16:
485 ; GFX11-DENORM: ; %bb.0:
486 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
487 ; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0
488 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
489 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
490 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
491 ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
492 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
493 ; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
494 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
495 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
496 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
497 ; GFX11-DENORM-NEXT: s_nop 0
498 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
499 ; GFX11-DENORM-NEXT: s_endpgm
500 %tid = call i32 @llvm.amdgcn.workitem.id.x()
501 %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
502 %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
503 %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
505 %r1 = load volatile half, ptr addrspace(1) %gep.0
506 %r2 = load volatile half, ptr addrspace(1) %gep.1
508 %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
509 store half %r3, ptr addrspace(1) %gep.out
; Kernel fmuladd_a_2.0_b_f16 calls llvm.fmuladd.f16 with the constant 2.0 as
; the SECOND multiplicand: fmuladd(a, 2.0, b).
; a and b are volatile loads from %out[tid] and %out[tid + 1], and the result
; is stored back to %out[tid]. The %in argument is not referenced by the body.
; The checks below expect the same instruction sequences as when 2.0 is the
; first multiplicand:
;   fiji, flush runs                  -> v_mac_f16 with an inline 2.0 operand
;   fiji, denorm runs                 -> v_fma_f16 with an inline 2.0 operand
;   gfx1010 / gfx1100, flush runs     -> v_add_f16 a, a followed by v_add_f16
;   gfx1010 / gfx1100, denorm runs    -> v_fmac_f16 with an inline 2.0 operand
513 define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
514 ; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
516 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
517 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
518 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
519 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
520 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
521 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
522 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
523 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
524 ; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc
525 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
526 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
527 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
528 ; VI-FLUSH-NEXT: v_mac_f16_e32 v2, 2.0, v4
529 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
530 ; VI-FLUSH-NEXT: s_endpgm
532 ; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16:
533 ; VI-DENORM: ; %bb.0:
534 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
535 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
536 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
537 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
538 ; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
539 ; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
540 ; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
541 ; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
542 ; VI-DENORM-NEXT: flat_load_ushort v4, v[0:1] glc
543 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
544 ; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] glc
545 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
546 ; VI-DENORM-NEXT: v_fma_f16 v2, v4, 2.0, v2
547 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
548 ; VI-DENORM-NEXT: s_endpgm
550 ; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
551 ; GFX10-FLUSH: ; %bb.0:
552 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
553 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
554 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
555 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
556 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
557 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
558 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
559 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
560 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
561 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
562 ; GFX10-FLUSH-NEXT: s_endpgm
564 ; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16:
565 ; GFX10-DENORM: ; %bb.0:
566 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
567 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
568 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
569 ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
570 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
571 ; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
572 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
573 ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
574 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
575 ; GFX10-DENORM-NEXT: s_endpgm
577 ; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
578 ; GFX11-FLUSH: ; %bb.0:
579 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
580 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
581 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
582 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
583 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
584 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
585 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
586 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
587 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
588 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
589 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
590 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
591 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
592 ; GFX11-FLUSH-NEXT: s_nop 0
593 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
594 ; GFX11-FLUSH-NEXT: s_endpgm
596 ; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16:
597 ; GFX11-DENORM: ; %bb.0:
598 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
599 ; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0
600 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
601 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
602 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
603 ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
604 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
605 ; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
606 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
607 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
608 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
609 ; GFX11-DENORM-NEXT: s_nop 0
610 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
611 ; GFX11-DENORM-NEXT: s_endpgm
612 %tid = call i32 @llvm.amdgcn.workitem.id.x()
613 %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
614 %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
615 %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
617 %r1 = load volatile half, ptr addrspace(1) %gep.0
618 %r2 = load volatile half, ptr addrspace(1) %gep.1
620 %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
621 store half %r3, ptr addrspace(1) %gep.out
625 define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
626 ; VI-FLUSH-LABEL: fadd_a_a_b_f16:
628 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
629 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
630 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
631 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
632 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
633 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
634 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
635 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
636 ; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc
637 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
638 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
639 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
640 ; VI-FLUSH-NEXT: v_mac_f16_e32 v2, 2.0, v4
641 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
642 ; VI-FLUSH-NEXT: s_endpgm
644 ; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
645 ; VI-DENORM-CONTRACT: ; %bb.0:
646 ; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
647 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
648 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
649 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
650 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
651 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
652 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
653 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
654 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v4, v[0:1] glc
655 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
656 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc
657 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
658 ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v4, 2.0, v2
659 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
660 ; VI-DENORM-CONTRACT-NEXT: s_endpgm
662 ; GFX10-FLUSH-LABEL: fadd_a_a_b_f16:
663 ; GFX10-FLUSH: ; %bb.0:
664 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
665 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
666 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
667 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
668 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
669 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
670 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
671 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
672 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
673 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
674 ; GFX10-FLUSH-NEXT: s_endpgm
676 ; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
677 ; GFX10-DENORM-STRICT: ; %bb.0:
678 ; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
679 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
680 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
681 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
682 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
683 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
684 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
685 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
686 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v2
687 ; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
688 ; GFX10-DENORM-STRICT-NEXT: s_endpgm
690 ; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
691 ; GFX10-DENORM-CONTRACT: ; %bb.0:
692 ; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
693 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
694 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
695 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
696 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
697 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
698 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
699 ; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
700 ; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
701 ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
703 ; GFX11-FLUSH-LABEL: fadd_a_a_b_f16:
704 ; GFX11-FLUSH: ; %bb.0:
705 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
706 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
707 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
708 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
709 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
710 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
711 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
712 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
713 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
714 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
715 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
716 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
717 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
718 ; GFX11-FLUSH-NEXT: s_nop 0
719 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
720 ; GFX11-FLUSH-NEXT: s_endpgm
722 ; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
723 ; GFX11-DENORM-STRICT: ; %bb.0:
724 ; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
725 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
726 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
727 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
728 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
729 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
730 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
731 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
732 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
733 ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
734 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
735 ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v2
736 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
737 ; GFX11-DENORM-STRICT-NEXT: s_nop 0
738 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
739 ; GFX11-DENORM-STRICT-NEXT: s_endpgm
741 ; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
742 ; GFX11-DENORM-CONTRACT: ; %bb.0:
743 ; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
744 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
745 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
746 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
747 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
748 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
749 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
750 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
751 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
752 ; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
753 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
754 ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
755 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
756 ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
757 ptr addrspace(1) %in1,
758 ptr addrspace(1) %in2) #0 {
759 %tid = call i32 @llvm.amdgcn.workitem.id.x()
760 %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
761 %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
762 %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
764 %r0 = load volatile half, ptr addrspace(1) %gep.0
765 %r1 = load volatile half, ptr addrspace(1) %gep.1
767 %add.0 = fadd half %r0, %r0
768 %add.1 = fadd half %add.0, %r1
769 store half %add.1, ptr addrspace(1) %gep.out
; Computes b + (a + a) on two volatile half loads, i.e. the operands of the
; outer fadd are swapped relative to fadd_a_a_b_f16. The check blocks below
; expect a single multiply-add by 2.0 (v_mac_f16, v_fma_f16 or v_fmac_f16) for
; the Fiji flush block and for every contract block, and two separate
; v_add_f16 instructions for the gfx10/gfx11 flush and strict blocks.
; NOTE(review) no check block in this function covers the Fiji denormal run
; with -fp-contract=on, so that run appears to verify nothing here. Confirm
; this is intended.
773 define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
774 ; VI-FLUSH-LABEL: fadd_b_a_a_f16:
776 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
777 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
778 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
779 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
780 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
781 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
782 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
783 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
784 ; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc
785 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
786 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
787 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
788 ; VI-FLUSH-NEXT: v_mac_f16_e32 v2, 2.0, v4
789 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
790 ; VI-FLUSH-NEXT: s_endpgm
792 ; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
793 ; VI-DENORM-CONTRACT: ; %bb.0:
794 ; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
795 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
796 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
797 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
798 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
799 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
800 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
801 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
802 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v4, v[0:1] glc
803 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
804 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc
805 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
806 ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v4, 2.0, v2
807 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
808 ; VI-DENORM-CONTRACT-NEXT: s_endpgm
810 ; GFX10-FLUSH-LABEL: fadd_b_a_a_f16:
811 ; GFX10-FLUSH: ; %bb.0:
812 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
813 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
814 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
815 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
816 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
817 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
818 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
819 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
820 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
821 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
822 ; GFX10-FLUSH-NEXT: s_endpgm
824 ; GFX10-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
825 ; GFX10-DENORM-STRICT: ; %bb.0:
826 ; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
827 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
828 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
829 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
830 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
831 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
832 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
833 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
834 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v2, v1
835 ; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
836 ; GFX10-DENORM-STRICT-NEXT: s_endpgm
838 ; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
839 ; GFX10-DENORM-CONTRACT: ; %bb.0:
840 ; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
841 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
842 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
843 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
844 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
845 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
846 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
847 ; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
848 ; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
849 ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
851 ; GFX11-FLUSH-LABEL: fadd_b_a_a_f16:
852 ; GFX11-FLUSH: ; %bb.0:
853 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
854 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
855 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
856 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
857 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
858 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
859 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
860 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
861 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
862 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
863 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
864 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
865 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
866 ; GFX11-FLUSH-NEXT: s_nop 0
867 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
868 ; GFX11-FLUSH-NEXT: s_endpgm
870 ; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
871 ; GFX11-DENORM-STRICT: ; %bb.0:
872 ; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
873 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
874 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
875 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
876 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
877 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
878 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
879 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
880 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
881 ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
882 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
883 ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v2, v1
884 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
885 ; GFX11-DENORM-STRICT-NEXT: s_nop 0
886 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
887 ; GFX11-DENORM-STRICT-NEXT: s_endpgm
889 ; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
890 ; GFX11-DENORM-CONTRACT: ; %bb.0:
891 ; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
892 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
893 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
894 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
895 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
896 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
897 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
898 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
899 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
900 ; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
901 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
902 ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
903 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
904 ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
905 ptr addrspace(1) %in1,
906 ptr addrspace(1) %in2) #0 { ; %in1 and %in2 are never referenced below
907 %tid = call i32 @llvm.amdgcn.workitem.id.x()
908 %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
909 %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 ; the half right after %gep.0
910 %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid ; same address as %gep.0
912 %r0 = load volatile half, ptr addrspace(1) %gep.0 ; a
913 %r1 = load volatile half, ptr addrspace(1) %gep.1 ; b
915 %add.0 = fadd half %r0, %r0 ; a + a
916 %add.1 = fadd half %r1, %add.0 ; b + (a + a), b is the left operand here
917 store half %add.1, ptr addrspace(1) %gep.out
; Calls llvm.fmuladd.f16 with a constant -2.0 multiplier, i.e. -2.0 * a + b on
; two volatile half loads. The check blocks expect -2.0 to appear directly as
; an operand of v_mac_f16, v_fma_f16 or v_fmac_f16, except in the gfx10/gfx11
; flush blocks, which expect v_add_f16 (a + a) followed by v_sub_f16
; (b - (a + a)). The %in argument is never referenced in the body.
921 define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
922 ; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
924 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
925 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
926 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
927 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
928 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
929 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
930 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
931 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
932 ; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc
933 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
934 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
935 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
936 ; VI-FLUSH-NEXT: v_mac_f16_e32 v2, -2.0, v4
937 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
938 ; VI-FLUSH-NEXT: s_endpgm
940 ; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
941 ; VI-DENORM: ; %bb.0:
942 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
943 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
944 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
945 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
946 ; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
947 ; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
948 ; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
949 ; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
950 ; VI-DENORM-NEXT: flat_load_ushort v4, v[0:1] glc
951 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
952 ; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] glc
953 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
954 ; VI-DENORM-NEXT: v_fma_f16 v2, v4, -2.0, v2
955 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
956 ; VI-DENORM-NEXT: s_endpgm
958 ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
959 ; GFX10-FLUSH: ; %bb.0:
960 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
961 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
962 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
963 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
964 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
965 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
966 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
967 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
968 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
969 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
970 ; GFX10-FLUSH-NEXT: s_endpgm
972 ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
973 ; GFX10-DENORM: ; %bb.0:
974 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
975 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
976 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
977 ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
978 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
979 ; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
980 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
981 ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
982 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
983 ; GFX10-DENORM-NEXT: s_endpgm
985 ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
986 ; GFX11-FLUSH: ; %bb.0:
987 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
988 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
989 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
990 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
991 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
992 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
993 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
994 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
995 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
996 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
997 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
998 ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
999 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
1000 ; GFX11-FLUSH-NEXT: s_nop 0
1001 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1002 ; GFX11-FLUSH-NEXT: s_endpgm
1004 ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
1005 ; GFX11-DENORM: ; %bb.0:
1006 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1007 ; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1008 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
1009 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1010 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
1011 ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
1012 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
1013 ; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1014 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
1015 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
1016 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
1017 ; GFX11-DENORM-NEXT: s_nop 0
1018 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1019 ; GFX11-DENORM-NEXT: s_endpgm
1020 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1021 %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
1022 %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 ; the half right after %gep.0
1023 %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid ; same address as %gep.0
1025 %r1 = load volatile half, ptr addrspace(1) %gep.0 ; a
1026 %r2 = load volatile half, ptr addrspace(1) %gep.1 ; b
1028 %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2) ; -2.0 * a + b
1029 store half %r3, ptr addrspace(1) %gep.out
; Calls llvm.fmuladd.f16 with a -2.0 multiplier and a negated first load,
; i.e. -2.0 * (-a) + b. The check blocks expect the two negations to cancel.
; Every block uses a positive 2.0 operand (v_mac_f16, v_fma_f16, v_fmac_f16)
; or, in the gfx10/gfx11 flush blocks, two v_add_f16 instructions, and no
; block contains a negate modifier or a v_sub_f16. The %in argument is never
; referenced in the body.
1033 define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1034 ; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1035 ; VI-FLUSH: ; %bb.0:
1036 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1037 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1038 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1039 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
1040 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1041 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1042 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1043 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1044 ; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc
1045 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1046 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
1047 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1048 ; VI-FLUSH-NEXT: v_mac_f16_e32 v2, 2.0, v4
1049 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
1050 ; VI-FLUSH-NEXT: s_endpgm
1052 ; VI-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1053 ; VI-DENORM: ; %bb.0:
1054 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1055 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1056 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
1057 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
1058 ; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1059 ; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1060 ; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1061 ; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1062 ; VI-DENORM-NEXT: flat_load_ushort v4, v[0:1] glc
1063 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
1064 ; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] glc
1065 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
1066 ; VI-DENORM-NEXT: v_fma_f16 v2, v4, 2.0, v2
1067 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
1068 ; VI-DENORM-NEXT: s_endpgm
1070 ; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1071 ; GFX10-FLUSH: ; %bb.0:
1072 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1073 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1074 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1075 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
1076 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1077 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1078 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1079 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
1080 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
1081 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
1082 ; GFX10-FLUSH-NEXT: s_endpgm
1084 ; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1085 ; GFX10-DENORM: ; %bb.0:
1086 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1087 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1088 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
1089 ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
1090 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
1091 ; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1092 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
1093 ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
1094 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
1095 ; GFX10-DENORM-NEXT: s_endpgm
1097 ; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1098 ; GFX11-FLUSH: ; %bb.0:
1099 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1100 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1101 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
1102 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1103 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1104 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
1105 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1106 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1107 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1108 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
1109 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
1110 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
1111 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
1112 ; GFX11-FLUSH-NEXT: s_nop 0
1113 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1114 ; GFX11-FLUSH-NEXT: s_endpgm
1116 ; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
1117 ; GFX11-DENORM: ; %bb.0:
1118 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1119 ; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1120 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
1121 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1122 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
1123 ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
1124 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
1125 ; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1126 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
1127 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
1128 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
1129 ; GFX11-DENORM-NEXT: s_nop 0
1130 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1131 ; GFX11-DENORM-NEXT: s_endpgm
1132 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1133 %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
1134 %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 ; the half right after %gep.0
1135 %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid ; same address as %gep.0
1137 %r1 = load volatile half, ptr addrspace(1) %gep.0 ; a
1138 %r2 = load volatile half, ptr addrspace(1) %gep.1 ; b
1140 %r1.fneg = fneg half %r1 ; -a
1142 %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2) ; -2.0 * (-a) + b
1143 store half %r3, ptr addrspace(1) %gep.out
; Calls llvm.fmuladd.f16 with a 2.0 multiplier and a negated first load,
; i.e. 2.0 * (-a) + b. The check blocks expect the fneg to be folded into the
; constant. The multiply-add instructions take -2.0 as an operand and the
; gfx10/gfx11 flush blocks compute b - (a + a), which is the same instruction
; sequence expected for fmuladd_neg_2.0_a_b_f16. The %in argument is never
; referenced in the body.
1147 define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1148 ; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
1149 ; VI-FLUSH: ; %bb.0:
1150 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1151 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1152 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1153 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
1154 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1155 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1156 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1157 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1158 ; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc
1159 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1160 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
1161 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1162 ; VI-FLUSH-NEXT: v_mac_f16_e32 v2, -2.0, v4
1163 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
1164 ; VI-FLUSH-NEXT: s_endpgm
1166 ; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
1167 ; VI-DENORM: ; %bb.0:
1168 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1169 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1170 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
1171 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
1172 ; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1173 ; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1174 ; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1175 ; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1176 ; VI-DENORM-NEXT: flat_load_ushort v4, v[0:1] glc
1177 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
1178 ; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] glc
1179 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
1180 ; VI-DENORM-NEXT: v_fma_f16 v2, v4, -2.0, v2
1181 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
1182 ; VI-DENORM-NEXT: s_endpgm
1184 ; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
1185 ; GFX10-FLUSH: ; %bb.0:
1186 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1187 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1188 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1189 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
1190 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1191 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1192 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1193 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
1194 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
1195 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
1196 ; GFX10-FLUSH-NEXT: s_endpgm
1198 ; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
1199 ; GFX10-DENORM: ; %bb.0:
1200 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1201 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1202 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
1203 ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
1204 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
1205 ; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1206 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
1207 ; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
1208 ; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
1209 ; GFX10-DENORM-NEXT: s_endpgm
1211 ; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
1212 ; GFX11-FLUSH: ; %bb.0:
1213 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1214 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1215 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
1216 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1217 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1218 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
1219 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1220 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1221 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1222 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
1223 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
1224 ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
1225 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
1226 ; GFX11-FLUSH-NEXT: s_nop 0
1227 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1228 ; GFX11-FLUSH-NEXT: s_endpgm
1230 ; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
1231 ; GFX11-DENORM: ; %bb.0:
1232 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1233 ; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1234 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
1235 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1236 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
1237 ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
1238 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
1239 ; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1240 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
1241 ; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
1242 ; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
1243 ; GFX11-DENORM-NEXT: s_nop 0
1244 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1245 ; GFX11-DENORM-NEXT: s_endpgm
1246 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1247 %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
1248 %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 ; the half right after %gep.0
1249 %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid ; same address as %gep.0
1251 %r1 = load volatile half, ptr addrspace(1) %gep.0 ; a
1252 %r2 = load volatile half, ptr addrspace(1) %gep.1 ; b
1254 %r1.fneg = fneg half %r1 ; -a
1256 %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2) ; 2.0 * (-a) + b
1257 store half %r3, ptr addrspace(1) %gep.out
; Calls llvm.fmuladd.f16 with a 2.0 multiplier and a negated addend,
; i.e. 2.0 * a + (-b). The check blocks expect the negation as a -v2 source
; modifier on a three-operand v_mad_f16 (Fiji flush block) or v_fma_f16
; (denormal blocks), and (a + a) - b via v_add_f16 then v_sub_f16 in the
; gfx10/gfx11 flush blocks. No block here uses v_mac_f16 or v_fmac_f16,
; presumably because those forms cannot carry a negated addend (confirm
; against the ISA documentation). The %in argument is never referenced in
; the body.
1261 define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1262 ; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
1263 ; VI-FLUSH: ; %bb.0:
1264 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1265 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1266 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1267 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
1268 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1269 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1270 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1271 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1272 ; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc
1273 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1274 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
1275 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1276 ; VI-FLUSH-NEXT: v_mad_f16 v2, v4, 2.0, -v2
1277 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
1278 ; VI-FLUSH-NEXT: s_endpgm
1280 ; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
1281 ; VI-DENORM: ; %bb.0:
1282 ; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1283 ; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1284 ; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
1285 ; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
1286 ; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1287 ; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1288 ; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1289 ; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1290 ; VI-DENORM-NEXT: flat_load_ushort v4, v[0:1] glc
1291 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
1292 ; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3] glc
1293 ; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
1294 ; VI-DENORM-NEXT: v_fma_f16 v2, v4, 2.0, -v2
1295 ; VI-DENORM-NEXT: flat_store_short v[0:1], v2
1296 ; VI-DENORM-NEXT: s_endpgm
1298 ; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
1299 ; GFX10-FLUSH: ; %bb.0:
1300 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1301 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1302 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1303 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
1304 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1305 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1306 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1307 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
1308 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
1309 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
1310 ; GFX10-FLUSH-NEXT: s_endpgm
1312 ; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
1313 ; GFX10-DENORM: ; %bb.0:
1314 ; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1315 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1316 ; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
1317 ; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
1318 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
1319 ; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
1320 ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
1321 ; GFX10-DENORM-NEXT: v_fma_f16 v1, v1, 2.0, -v2
1322 ; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1]
1323 ; GFX10-DENORM-NEXT: s_endpgm
1325 ; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
1326 ; GFX11-FLUSH: ; %bb.0:
1327 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1328 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1329 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
1330 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1331 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1332 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
1333 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1334 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1335 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1336 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
1337 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
1338 ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
1339 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
1340 ; GFX11-FLUSH-NEXT: s_nop 0
1341 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1342 ; GFX11-FLUSH-NEXT: s_endpgm
1344 ; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
1345 ; GFX11-DENORM: ; %bb.0:
1346 ; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1347 ; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1348 ; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
1349 ; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1350 ; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
1351 ; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
1352 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
1353 ; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
1354 ; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
1355 ; GFX11-DENORM-NEXT: v_fma_f16 v1, v1, 2.0, -v2
1356 ; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1]
1357 ; GFX11-DENORM-NEXT: s_nop 0
1358 ; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1359 ; GFX11-DENORM-NEXT: s_endpgm
1360 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1361 %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
1362 %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1 ; the half right after %gep.0
1363 %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid ; same address as %gep.0
1365 %r1 = load volatile half, ptr addrspace(1) %gep.0 ; a
1366 %r2 = load volatile half, ptr addrspace(1) %gep.1 ; b
1368 %r2.fneg = fneg half %r2 ; -b
1370 %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg) ; 2.0 * a + (-b)
1371 store half %r3, ptr addrspace(1) %gep.out
; mad_sub_f16: per-workitem a * b - c on half values, written in IR as a
; separate fmul followed by an fsub (no llvm.fmuladd call, no fast-math flags
; on either instruction). a, b and c are volatile loads from %ptr at element
; indices tid, tid + 1 and tid + 2; the result is stored to %out at index tid.
;
; What the autogenerated assertion blocks in this function record:
;   * VI-FLUSH                     - a single v_mad_f16 with the addend negated.
;   * VI-DENORM-CONTRACT           - a single v_fma_f16 with the addend negated.
;   * GFX10-FLUSH and GFX11-FLUSH  - separate v_mul_f16 and v_sub_f16.
;   * GFX10/GFX11 DENORM-STRICT    - separate v_mul_f16 and v_sub_f16.
;   * GFX10/GFX11 DENORM-CONTRACT  - a single v_fma_f16 with the addend negated.
;
; NOTE(review): this function has no assertion block for fiji with IEEE f16
; denormals and -fp-contract=on. The corresponding RUN line at the top of the
; file passes only the VI-DENORM prefix, and no block with that prefix exists
; here, so that configuration looks unchecked - confirm whether a dedicated
; strict prefix was intended.
;
; The assertion lines are tool output: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1375 define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
1376 ; VI-FLUSH-LABEL: mad_sub_f16:
1377 ; VI-FLUSH: ; %bb.0:
1378 ; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1379 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0
1380 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1381 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
1382 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6
1383 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1384 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1385 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1386 ; VI-FLUSH-NEXT: v_add_u32_e32 v4, vcc, 4, v0
1387 ; VI-FLUSH-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1388 ; VI-FLUSH-NEXT: flat_load_ushort v7, v[0:1] glc
1389 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1390 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
1391 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1392 ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc
1393 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1394 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
1395 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6
1396 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1397 ; VI-FLUSH-NEXT: v_mad_f16 v2, v7, v2, -v3
1398 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
1399 ; VI-FLUSH-NEXT: s_endpgm
1401 ; VI-DENORM-CONTRACT-LABEL: mad_sub_f16:
1402 ; VI-DENORM-CONTRACT: ; %bb.0:
1403 ; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1404 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0
1405 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
1406 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
1407 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6
1408 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1409 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1410 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1411 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 4, v0
1412 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1413 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v7, v[0:1] glc
1414 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1415 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc
1416 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1417 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc
1418 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1419 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
1420 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6
1421 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1422 ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v7, v2, -v3
1423 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
1424 ; VI-DENORM-CONTRACT-NEXT: s_endpgm
1426 ; GFX10-FLUSH-LABEL: mad_sub_f16:
1427 ; GFX10-FLUSH: ; %bb.0:
1428 ; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1429 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1430 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1431 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
1432 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1433 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
1434 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1435 ; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
1436 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1437 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
1438 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3
1439 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
1440 ; GFX10-FLUSH-NEXT: s_endpgm
1442 ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16:
1443 ; GFX10-DENORM-STRICT: ; %bb.0:
1444 ; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1445 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1446 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
1447 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
1448 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1449 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
1450 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1451 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
1452 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1453 ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
1454 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3
1455 ; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5]
1456 ; GFX10-DENORM-STRICT-NEXT: s_endpgm
1458 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16:
1459 ; GFX10-DENORM-CONTRACT: ; %bb.0:
1460 ; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1461 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1462 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
1463 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
1464 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1465 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
1466 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1467 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
1468 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1469 ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -v3
1470 ; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5]
1471 ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
1473 ; GFX11-FLUSH-LABEL: mad_sub_f16:
1474 ; GFX11-FLUSH: ; %bb.0:
1475 ; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1476 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1477 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
1478 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1479 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1480 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
1481 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1482 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1483 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1484 ; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1485 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1486 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
1487 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
1488 ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3
1489 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
1490 ; GFX11-FLUSH-NEXT: s_nop 0
1491 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1492 ; GFX11-FLUSH-NEXT: s_endpgm
1494 ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16:
1495 ; GFX11-DENORM-STRICT: ; %bb.0:
1496 ; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1497 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1498 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
1499 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1500 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
1501 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
1502 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1503 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1504 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1505 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1506 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1507 ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
1508 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
1509 ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3
1510 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
1511 ; GFX11-DENORM-STRICT-NEXT: s_nop 0
1512 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1513 ; GFX11-DENORM-STRICT-NEXT: s_endpgm
1515 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16:
1516 ; GFX11-DENORM-CONTRACT: ; %bb.0:
1517 ; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1518 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1519 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
1520 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1521 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
1522 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
1523 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1524 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1525 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1526 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1527 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1528 ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -v3
1529 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
1530 ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
1531 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1532 ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
1533 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
; Sign-extend the workitem id so that all indexing below is done in i64.
1534 %tid.ext = sext i32 %tid to i64
; %gep0, %gep1 and %gep2 address three consecutive half elements of %ptr,
; starting at index tid; %outgep addresses element tid of %out.
1535 %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
1536 %add1 = add i64 %tid.ext, 1
1537 %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
1538 %add2 = add i64 %tid.ext, 2
1539 %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
1540 %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads of a, b and c. Every assertion block above shows three
; separate 16-bit loads, each followed by its own s_waitcnt vmcnt(0).
1541 %a = load volatile half, ptr addrspace(1) %gep0, align 2
1542 %b = load volatile half, ptr addrspace(1) %gep1, align 2
1543 %c = load volatile half, ptr addrspace(1) %gep2, align 2
; a * b - c as two separate operations; whether they end up as one fused
; instruction is what the assertion blocks above record per configuration.
1544 %mul = fmul half %a, %b
1545 %sub = fsub half %mul, %c
1546 store half %sub, ptr addrspace(1) %outgep, align 2
; mad_sub_inv_f16: per-workitem c - a * b on half values (the subtraction
; operands are swapped relative to a * b - c), written in IR as a separate
; fmul followed by an fsub with no fast-math flags. a, b and c are volatile
; loads from %ptr at element indices tid, tid + 1 and tid + 2; the result is
; stored to %out at index tid.
;
; What the autogenerated assertion blocks in this function record:
;   * VI-FLUSH                     - a single v_mad_f16 with the first
;                                    multiplicand negated.
;   * VI-DENORM-CONTRACT           - a single v_fma_f16 with the first
;                                    multiplicand negated.
;   * GFX10-FLUSH and GFX11-FLUSH  - v_mul_f16, then v_sub_f16 computing
;                                    c minus the product.
;   * GFX10/GFX11 DENORM-STRICT    - v_mul_f16, then v_sub_f16 computing
;                                    c minus the product.
;   * GFX10/GFX11 DENORM-CONTRACT  - a single v_fma_f16 with the first
;                                    multiplicand negated.
;
; NOTE(review): this function has no assertion block for fiji with IEEE f16
; denormals and -fp-contract=on. The corresponding RUN line at the top of the
; file passes only the VI-DENORM prefix, and no block with that prefix exists
; here, so that configuration looks unchecked - confirm whether a dedicated
; strict prefix was intended.
;
; The assertion lines are tool output: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1550 define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
1551 ; VI-FLUSH-LABEL: mad_sub_inv_f16:
1552 ; VI-FLUSH: ; %bb.0:
1553 ; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1554 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0
1555 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1556 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
1557 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6
1558 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1559 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1560 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1561 ; VI-FLUSH-NEXT: v_add_u32_e32 v4, vcc, 4, v0
1562 ; VI-FLUSH-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1563 ; VI-FLUSH-NEXT: flat_load_ushort v7, v[0:1] glc
1564 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1565 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
1566 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1567 ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc
1568 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1569 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
1570 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6
1571 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1572 ; VI-FLUSH-NEXT: v_mad_f16 v2, -v7, v2, v3
1573 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
1574 ; VI-FLUSH-NEXT: s_endpgm
1576 ; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
1577 ; VI-DENORM-CONTRACT: ; %bb.0:
1578 ; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1579 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0
1580 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
1581 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
1582 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6
1583 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1584 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1585 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1586 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 4, v0
1587 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1588 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v7, v[0:1] glc
1589 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1590 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc
1591 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1592 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc
1593 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1594 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
1595 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6
1596 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1597 ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, -v7, v2, v3
1598 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
1599 ; VI-DENORM-CONTRACT-NEXT: s_endpgm
1601 ; GFX10-FLUSH-LABEL: mad_sub_inv_f16:
1602 ; GFX10-FLUSH: ; %bb.0:
1603 ; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1604 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1605 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1606 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
1607 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1608 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
1609 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1610 ; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
1611 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1612 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
1613 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v3, v1
1614 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
1615 ; GFX10-FLUSH-NEXT: s_endpgm
1617 ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16:
1618 ; GFX10-DENORM-STRICT: ; %bb.0:
1619 ; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1620 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1621 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
1622 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
1623 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1624 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
1625 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1626 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
1627 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1628 ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
1629 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v3, v1
1630 ; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5]
1631 ; GFX10-DENORM-STRICT-NEXT: s_endpgm
1633 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
1634 ; GFX10-DENORM-CONTRACT: ; %bb.0:
1635 ; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1636 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1637 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
1638 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
1639 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1640 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
1641 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1642 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
1643 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1644 ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, v3
1645 ; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5]
1646 ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
1648 ; GFX11-FLUSH-LABEL: mad_sub_inv_f16:
1649 ; GFX11-FLUSH: ; %bb.0:
1650 ; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1651 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1652 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
1653 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1654 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1655 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
1656 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1657 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1658 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1659 ; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1660 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1661 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
1662 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
1663 ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v3, v1
1664 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
1665 ; GFX11-FLUSH-NEXT: s_nop 0
1666 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1667 ; GFX11-FLUSH-NEXT: s_endpgm
1669 ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16:
1670 ; GFX11-DENORM-STRICT: ; %bb.0:
1671 ; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1672 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1673 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
1674 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1675 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
1676 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
1677 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1678 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1679 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1680 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1681 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1682 ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
1683 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
1684 ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v3, v1
1685 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
1686 ; GFX11-DENORM-STRICT-NEXT: s_nop 0
1687 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1688 ; GFX11-DENORM-STRICT-NEXT: s_endpgm
1690 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
1691 ; GFX11-DENORM-CONTRACT: ; %bb.0:
1692 ; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1693 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1694 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
1695 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1696 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
1697 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
1698 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1699 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1700 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1701 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1702 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1703 ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, v3
1704 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
1705 ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
1706 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1707 ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
1708 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
; Sign-extend the workitem id so that all indexing below is done in i64.
1709 %tid.ext = sext i32 %tid to i64
; %gep0, %gep1 and %gep2 address three consecutive half elements of %ptr,
; starting at index tid; %outgep addresses element tid of %out.
1710 %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
1711 %add1 = add i64 %tid.ext, 1
1712 %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
1713 %add2 = add i64 %tid.ext, 2
1714 %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
1715 %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads of a, b and c. Every assertion block above shows three
; separate 16-bit loads, each followed by its own s_waitcnt vmcnt(0).
1716 %a = load volatile half, ptr addrspace(1) %gep0, align 2
1717 %b = load volatile half, ptr addrspace(1) %gep1, align 2
1718 %c = load volatile half, ptr addrspace(1) %gep2, align 2
; c - a * b: the product is the subtrahend here, which is why the fused
; forms above negate a multiplicand rather than the addend.
1719 %mul = fmul half %a, %b
1720 %sub = fsub half %c, %mul
1721 store half %sub, ptr addrspace(1) %outgep, align 2
; mad_sub_fabs_f16: per-workitem a * b - |c| on half values. The IR takes
; llvm.fabs.f16 of c, then does a separate fmul and fsub with no fast-math
; flags. a, b and c are volatile loads from %ptr at element indices tid,
; tid + 1 and tid + 2; the result is stored to %out at index tid.
;
; What the autogenerated assertion blocks in this function record:
;   * VI-FLUSH                     - a single v_mad_f16 whose addend operand
;                                    is -|v3|.
;   * VI-DENORM-CONTRACT           - a single v_fma_f16 whose addend operand
;                                    is -|v3|.
;   * GFX10-FLUSH and GFX11-FLUSH  - v_mul_f16, then v_sub_f16_e64 with |v3|
;                                    as the subtrahend operand.
;   * GFX10/GFX11 DENORM-STRICT    - v_mul_f16, then v_sub_f16_e64 with |v3|
;                                    as the subtrahend operand.
;   * GFX10/GFX11 DENORM-CONTRACT  - a single v_fma_f16 whose addend operand
;                                    is -|v3|.
; In every block the fabs appears only as the |v3| operand modifier; none of
; them contains a separate instruction for it.
;
; NOTE(review): this function has no assertion block for fiji with IEEE f16
; denormals and -fp-contract=on. The corresponding RUN line at the top of the
; file passes only the VI-DENORM prefix, and no block with that prefix exists
; here, so that configuration looks unchecked - confirm whether a dedicated
; strict prefix was intended.
;
; The assertion lines are tool output: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1725 define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
1726 ; VI-FLUSH-LABEL: mad_sub_fabs_f16:
1727 ; VI-FLUSH: ; %bb.0:
1728 ; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1729 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0
1730 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1731 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
1732 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6
1733 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1734 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1735 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1736 ; VI-FLUSH-NEXT: v_add_u32_e32 v4, vcc, 4, v0
1737 ; VI-FLUSH-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1738 ; VI-FLUSH-NEXT: flat_load_ushort v7, v[0:1] glc
1739 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1740 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
1741 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1742 ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc
1743 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1744 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
1745 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6
1746 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1747 ; VI-FLUSH-NEXT: v_mad_f16 v2, v7, v2, -|v3|
1748 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
1749 ; VI-FLUSH-NEXT: s_endpgm
1751 ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
1752 ; VI-DENORM-CONTRACT: ; %bb.0:
1753 ; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1754 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0
1755 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
1756 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
1757 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6
1758 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1759 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1760 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1761 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 4, v0
1762 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1763 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v7, v[0:1] glc
1764 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1765 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc
1766 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1767 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc
1768 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1769 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
1770 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6
1771 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1772 ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v7, v2, -|v3|
1773 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
1774 ; VI-DENORM-CONTRACT-NEXT: s_endpgm
1776 ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16:
1777 ; GFX10-FLUSH: ; %bb.0:
1778 ; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1779 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1780 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1781 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
1782 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1783 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
1784 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1785 ; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
1786 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1787 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
1788 ; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, v1, |v3|
1789 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
1790 ; GFX10-FLUSH-NEXT: s_endpgm
1792 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
1793 ; GFX10-DENORM-STRICT: ; %bb.0:
1794 ; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1795 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1796 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
1797 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
1798 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1799 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
1800 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1801 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
1802 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1803 ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
1804 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, v1, |v3|
1805 ; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5]
1806 ; GFX10-DENORM-STRICT-NEXT: s_endpgm
1808 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
1809 ; GFX10-DENORM-CONTRACT: ; %bb.0:
1810 ; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1811 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1812 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
1813 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
1814 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1815 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
1816 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1817 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
1818 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1819 ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -|v3|
1820 ; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5]
1821 ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
1823 ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16:
1824 ; GFX11-FLUSH: ; %bb.0:
1825 ; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1826 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1827 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
1828 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1829 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1830 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
1831 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1832 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1833 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1834 ; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1835 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
1836 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
1837 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
1838 ; GFX11-FLUSH-NEXT: v_sub_f16_e64 v1, v1, |v3|
1839 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
1840 ; GFX11-FLUSH-NEXT: s_nop 0
1841 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1842 ; GFX11-FLUSH-NEXT: s_endpgm
1844 ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
1845 ; GFX11-DENORM-STRICT: ; %bb.0:
1846 ; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1847 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1848 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
1849 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1850 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
1851 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
1852 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1853 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1854 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1855 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1856 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1857 ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
1858 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
1859 ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, v1, |v3|
1860 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
1861 ; GFX11-DENORM-STRICT-NEXT: s_nop 0
1862 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1863 ; GFX11-DENORM-STRICT-NEXT: s_endpgm
1865 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
1866 ; GFX11-DENORM-CONTRACT: ; %bb.0:
1867 ; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
1868 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1869 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
1870 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1871 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
1872 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
1873 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1874 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
1875 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1876 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
1877 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1878 ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -|v3|
1879 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
1880 ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
1881 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1882 ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
1883 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
; Sign-extend the workitem id so that all indexing below is done in i64.
1884 %tid.ext = sext i32 %tid to i64
; %gep0, %gep1 and %gep2 address three consecutive half elements of %ptr,
; starting at index tid; %outgep addresses element tid of %out.
1885 %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
1886 %add1 = add i64 %tid.ext, 1
1887 %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
1888 %add2 = add i64 %tid.ext, 2
1889 %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
1890 %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
; Volatile loads of a, b and c. Every assertion block above shows three
; separate 16-bit loads, each followed by its own s_waitcnt vmcnt(0).
1891 %a = load volatile half, ptr addrspace(1) %gep0, align 2
1892 %b = load volatile half, ptr addrspace(1) %gep1, align 2
1893 %c = load volatile half, ptr addrspace(1) %gep2, align 2
; |c| is taken before the subtraction; the assertion blocks above show it as
; the |v3| operand modifier.
1894 %c.abs = call half @llvm.fabs.f16(half %c) #0
; a * b - |c| as two separate operations; whether they end up as one fused
; instruction is what the assertion blocks above record per configuration.
1895 %mul = fmul half %a, %b
1896 %sub = fsub half %mul, %c.abs
1897 store half %sub, ptr addrspace(1) %outgep, align 2
1901 define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
1902 ; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16:
1903 ; VI-FLUSH: ; %bb.0:
1904 ; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1905 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0
1906 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1907 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
1908 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6
1909 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1910 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1911 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1912 ; VI-FLUSH-NEXT: v_add_u32_e32 v4, vcc, 4, v0
1913 ; VI-FLUSH-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1914 ; VI-FLUSH-NEXT: flat_load_ushort v7, v[0:1] glc
1915 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1916 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
1917 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1918 ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc
1919 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
1920 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
1921 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6
1922 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1923 ; VI-FLUSH-NEXT: v_mad_f16 v2, -v7, v2, |v3|
1924 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
1925 ; VI-FLUSH-NEXT: s_endpgm
1927 ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
1928 ; VI-DENORM-CONTRACT: ; %bb.0:
1929 ; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
1930 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0
1931 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
1932 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
1933 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6
1934 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1935 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
1936 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1937 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 4, v0
1938 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
1939 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v7, v[0:1] glc
1940 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1941 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc
1942 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1943 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc
1944 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1945 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
1946 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6
1947 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1948 ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, -v7, v2, |v3|
1949 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
1950 ; VI-DENORM-CONTRACT-NEXT: s_endpgm
1952 ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16:
1953 ; GFX10-FLUSH: ; %bb.0:
1954 ; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1955 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1956 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
1957 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
1958 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1959 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
1960 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1961 ; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
1962 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
1963 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
1964 ; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, |v3|, v1
1965 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
1966 ; GFX10-FLUSH-NEXT: s_endpgm
1968 ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
1969 ; GFX10-DENORM-STRICT: ; %bb.0:
1970 ; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1971 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1972 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
1973 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
1974 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1975 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
1976 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1977 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
1978 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
1979 ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
1980 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, |v3|, v1
1981 ; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5]
1982 ; GFX10-DENORM-STRICT-NEXT: s_endpgm
1984 ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
1985 ; GFX10-DENORM-CONTRACT: ; %bb.0:
1986 ; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
1987 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1988 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
1989 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
1990 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1991 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
1992 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1993 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
1994 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
1995 ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, |v3|
1996 ; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5]
1997 ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
1999 ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16:
2000 ; GFX11-FLUSH: ; %bb.0:
2001 ; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2002 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2003 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
2004 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2005 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2006 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
2007 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2008 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2009 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2010 ; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2011 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2012 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
2013 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
2014 ; GFX11-FLUSH-NEXT: v_sub_f16_e64 v1, |v3|, v1
2015 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
2016 ; GFX11-FLUSH-NEXT: s_nop 0
2017 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2018 ; GFX11-FLUSH-NEXT: s_endpgm
2020 ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
2021 ; GFX11-DENORM-STRICT: ; %bb.0:
2022 ; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2023 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2024 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2025 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2026 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
2027 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
2028 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2029 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2030 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2031 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2032 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2033 ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
2034 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2035 ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, |v3|, v1
2036 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
2037 ; GFX11-DENORM-STRICT-NEXT: s_nop 0
2038 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2039 ; GFX11-DENORM-STRICT-NEXT: s_endpgm
2041 ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
2042 ; GFX11-DENORM-CONTRACT: ; %bb.0:
2043 ; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2044 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2045 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2046 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2047 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2048 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
2049 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2050 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2051 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2052 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2053 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2054 ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, |v3|
2055 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
2056 ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
2057 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2058 ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
2059 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
2060 %tid.ext = sext i32 %tid to i64
2061 %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
2062 %add1 = add i64 %tid.ext, 1
2063 %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
2064 %add2 = add i64 %tid.ext, 2
2065 %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
2066 %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
2067 %a = load volatile half, ptr addrspace(1) %gep0, align 2
2068 %b = load volatile half, ptr addrspace(1) %gep1, align 2
2069 %c = load volatile half, ptr addrspace(1) %gep2, align 2
2070 %c.abs = call half @llvm.fabs.f16(half %c) #0
2071 %mul = fmul half %a, %b
2072 %sub = fsub half %c.abs, %mul
2073 store half %sub, ptr addrspace(1) %outgep, align 2
; neg_neg_mad_f16: computes (-a) * (-b) + c on half values and stores the
; result to element tid of %out, where a, b and c are elements tid, tid+1 and
; tid+2 of %ptr and tid is the workitem id.
;
; The expected code below shows the two negations cancelling each other: the
; configurations that fuse emit one v_mac_f16 / v_fma_f16 / v_fmac_f16 whose
; sources carry no neg modifier, and the remaining configurations emit a plain
; v_mul_f16 followed by v_add_f16.
2077 define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
2078 ; VI-FLUSH-LABEL: neg_neg_mad_f16:
2079 ; VI-FLUSH: ; %bb.0:
2080 ; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2081 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0
2082 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2083 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
2084 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6
2085 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2086 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
2087 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
2088 ; VI-FLUSH-NEXT: v_add_u32_e32 v4, vcc, 4, v0
2089 ; VI-FLUSH-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2090 ; VI-FLUSH-NEXT: flat_load_ushort v7, v[0:1] glc
2091 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
2092 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
2093 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
2094 ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc
2095 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
2096 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
2097 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6
2098 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2099 ; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v7, v2
2100 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
2101 ; VI-FLUSH-NEXT: s_endpgm
2103 ; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
2104 ; VI-DENORM-CONTRACT: ; %bb.0:
2105 ; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2106 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0
2107 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2108 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
2109 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6
2110 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2111 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
2112 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
2113 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 4, v0
2114 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2115 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v7, v[0:1] glc
2116 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2117 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc
2118 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2119 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc
2120 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2121 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
2122 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6
2123 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2124 ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v7, v2, v3
2125 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
2126 ; VI-DENORM-CONTRACT-NEXT: s_endpgm
2128 ; GFX10-FLUSH-LABEL: neg_neg_mad_f16:
2129 ; GFX10-FLUSH: ; %bb.0:
2130 ; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2131 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2132 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2133 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
2134 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
2135 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
2136 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
2137 ; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
2138 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
2139 ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
2140 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v3, v1
2141 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
2142 ; GFX10-FLUSH-NEXT: s_endpgm
2144 ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16:
2145 ; GFX10-DENORM-STRICT: ; %bb.0:
2146 ; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2147 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2148 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
2149 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
2150 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2151 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
2152 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2153 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
2154 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2155 ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
2156 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v3, v1
2157 ; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5]
2158 ; GFX10-DENORM-STRICT-NEXT: s_endpgm
2160 ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
2161 ; GFX10-DENORM-CONTRACT: ; %bb.0:
2162 ; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2163 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2164 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2165 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
2166 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2167 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
2168 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2169 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
2170 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2171 ; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
2172 ; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5]
2173 ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
2175 ; GFX11-FLUSH-LABEL: neg_neg_mad_f16:
2176 ; GFX11-FLUSH: ; %bb.0:
2177 ; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2178 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2179 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
2180 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2181 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2182 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
2183 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2184 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2185 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2186 ; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2187 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2188 ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
2189 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
2190 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v3, v1
2191 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
2192 ; GFX11-FLUSH-NEXT: s_nop 0
2193 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2194 ; GFX11-FLUSH-NEXT: s_endpgm
2196 ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16:
2197 ; GFX11-DENORM-STRICT: ; %bb.0:
2198 ; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2199 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2200 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2201 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2202 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
2203 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
2204 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2205 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2206 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2207 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2208 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2209 ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
2210 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2211 ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v3, v1
2212 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
2213 ; GFX11-DENORM-STRICT-NEXT: s_nop 0
2214 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2215 ; GFX11-DENORM-STRICT-NEXT: s_endpgm
2217 ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
2218 ; GFX11-DENORM-CONTRACT: ; %bb.0:
2219 ; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2220 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2221 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2222 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2223 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2224 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
2225 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2226 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2227 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2228 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2229 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2230 ; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
2231 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[0:1]
2232 ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
2233 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2234 ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
; Addresses: elements tid, tid+1 and tid+2 of %ptr, and element tid of %out.
2235 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
2236 %tid.ext = sext i32 %tid to i64
2237 %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
2238 %add1 = add i64 %tid.ext, 1
2239 %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
2240 %add2 = add i64 %tid.ext, 2
2241 %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
2242 %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
; The loads are volatile; the expected code issues them one at a time, each
; followed by its own wait.
2243 %a = load volatile half, ptr addrspace(1) %gep0, align 2
2244 %b = load volatile half, ptr addrspace(1) %gep1, align 2
2245 %c = load volatile half, ptr addrspace(1) %gep2, align 2
; Both multiplicands are negated before the multiply-add.
2246 %nega = fneg half %a
2247 %negb = fneg half %b
2248 %mul = fmul half %nega, %negb
2249 %sub = fadd half %mul, %c
2250 store half %sub, ptr addrspace(1) %outgep, align 2
; mad_fabs_sub_f16: computes a * |b| - c on half values and stores the result
; to element tid of %out, where a, b and c are elements tid, tid+1 and tid+2
; of %ptr and tid is the workitem id.
;
; In the expected code below, the configurations that fuse emit a single
; v_mad_f16 / v_fma_f16 with |b| as a source modifier and c as a negated
; addend; the remaining configurations emit v_mul_f16 with the |b| modifier
; followed by v_sub_f16.
2254 define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
2255 ; VI-FLUSH-LABEL: mad_fabs_sub_f16:
2256 ; VI-FLUSH: ; %bb.0:
2257 ; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2258 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0
2259 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2260 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
2261 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6
2262 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2263 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
2264 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
2265 ; VI-FLUSH-NEXT: v_add_u32_e32 v4, vcc, 4, v0
2266 ; VI-FLUSH-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2267 ; VI-FLUSH-NEXT: flat_load_ushort v7, v[0:1] glc
2268 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
2269 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
2270 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
2271 ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc
2272 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
2273 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
2274 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6
2275 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2276 ; VI-FLUSH-NEXT: v_mad_f16 v2, v7, |v2|, -v3
2277 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
2278 ; VI-FLUSH-NEXT: s_endpgm
2280 ; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
2281 ; VI-DENORM-CONTRACT: ; %bb.0:
2282 ; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
2283 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0
2284 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2285 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
2286 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6
2287 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2288 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
2289 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
2290 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v4, vcc, 4, v0
2291 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
2292 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v7, v[0:1] glc
2293 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2294 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc
2295 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2296 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc
2297 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2298 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
2299 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6
2300 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2301 ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v7, |v2|, -v3
2302 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
2303 ; VI-DENORM-CONTRACT-NEXT: s_endpgm
2305 ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16:
2306 ; GFX10-FLUSH: ; %bb.0:
2307 ; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2308 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2309 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2310 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
2311 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
2312 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
2313 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
2314 ; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
2315 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
2316 ; GFX10-FLUSH-NEXT: v_mul_f16_e64 v1, v1, |v2|
2317 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3
2318 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
2319 ; GFX10-FLUSH-NEXT: s_endpgm
2321 ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
2322 ; GFX10-DENORM-STRICT: ; %bb.0:
2323 ; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2324 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2325 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
2326 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
2327 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2328 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
2329 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2330 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
2331 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2332 ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e64 v1, v1, |v2|
2333 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3
2334 ; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5]
2335 ; GFX10-DENORM-STRICT-NEXT: s_endpgm
2337 ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
2338 ; GFX10-DENORM-CONTRACT: ; %bb.0:
2339 ; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
2340 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2341 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2342 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
2343 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2344 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc
2345 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2346 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc
2347 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2348 ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, |v2|, -v3
2349 ; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5]
2350 ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
2352 ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16:
2353 ; GFX11-FLUSH: ; %bb.0:
2354 ; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2355 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2356 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
2357 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2358 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2359 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
2360 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2361 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2362 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2363 ; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2364 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2365 ; GFX11-FLUSH-NEXT: v_mul_f16_e64 v1, v1, |v2|
2366 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
2367 ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3
2368 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
2369 ; GFX11-FLUSH-NEXT: s_nop 0
2370 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2371 ; GFX11-FLUSH-NEXT: s_endpgm
2373 ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
2374 ; GFX11-DENORM-STRICT: ; %bb.0:
2375 ; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2376 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2377 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2378 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2379 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
2380 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
2381 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2382 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2383 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2384 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2385 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2386 ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e64 v1, v1, |v2|
2387 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2388 ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3
2389 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
2390 ; GFX11-DENORM-STRICT-NEXT: s_nop 0
2391 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2392 ; GFX11-DENORM-STRICT-NEXT: s_endpgm
2394 ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
2395 ; GFX11-DENORM-CONTRACT: ; %bb.0:
2396 ; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
2397 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2398 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2399 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2400 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2401 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
2402 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2403 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
2404 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2405 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
2406 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2407 ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, |v2|, -v3
2408 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
2409 ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
2410 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2411 ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
; Addresses: elements tid, tid+1 and tid+2 of %ptr, and element tid of %out.
2412 %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
2413 %tid.ext = sext i32 %tid to i64
2414 %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
2415 %add1 = add i64 %tid.ext, 1
2416 %gep1 = getelementptr half, ptr addrspace(1) %ptr, i64 %add1
2417 %add2 = add i64 %tid.ext, 2
2418 %gep2 = getelementptr half, ptr addrspace(1) %ptr, i64 %add2
2419 %outgep = getelementptr half, ptr addrspace(1) %out, i64 %tid.ext
; The loads are volatile; the expected code issues them one at a time, each
; followed by its own wait.
2420 %a = load volatile half, ptr addrspace(1) %gep0, align 2
2421 %b = load volatile half, ptr addrspace(1) %gep1, align 2
2422 %c = load volatile half, ptr addrspace(1) %gep2, align 2
; Only the second multiplicand goes through fabs; c is subtracted from the
; product.
2423 %b.abs = call half @llvm.fabs.f16(half %b) #0
2424 %mul = fmul half %a, %b.abs
2425 %sub = fsub half %mul, %c
2426 store half %sub, ptr addrspace(1) %outgep, align 2
; fsub_c_fadd_a_a_f16: computes c - (a + a) on half values, where a is element
; tid of %out and c is element tid+1 of %out, and stores the result back to
; element tid of %out. The %in argument is not referenced by the body.
;
; In the expected code below, the configurations that fuse express the result
; as a multiply-add of a by -2.0 into c (v_mac_f16 / v_fma_f16 / v_fmac_f16);
; the remaining configurations emit v_add_f16 of a with itself followed by
; v_sub_f16.
2430 define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
2431 ; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
2432 ; VI-FLUSH: ; %bb.0:
2433 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2434 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2435 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2436 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
2437 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2438 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2439 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
2440 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
2441 ; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc
2442 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
2443 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
2444 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
2445 ; VI-FLUSH-NEXT: v_mac_f16_e32 v2, -2.0, v4
2446 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
2447 ; VI-FLUSH-NEXT: s_endpgm
2449 ; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
2450 ; VI-DENORM-CONTRACT: ; %bb.0:
2451 ; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2452 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2453 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2454 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
2455 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2456 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2457 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
2458 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
2459 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v4, v[0:1] glc
2460 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2461 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc
2462 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2463 ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v4, -2.0, v2
2464 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
2465 ; VI-DENORM-CONTRACT-NEXT: s_endpgm
2467 ; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
2468 ; GFX10-FLUSH: ; %bb.0:
2469 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2470 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2471 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2472 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
2473 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
2474 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2475 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
2476 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
2477 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
2478 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
2479 ; GFX10-FLUSH-NEXT: s_endpgm
2481 ; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
2482 ; GFX10-DENORM-STRICT: ; %bb.0:
2483 ; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2484 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2485 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
2486 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
2487 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2488 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2489 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2490 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
2491 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v2, v1
2492 ; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
2493 ; GFX10-DENORM-STRICT-NEXT: s_endpgm
2495 ; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
2496 ; GFX10-DENORM-CONTRACT: ; %bb.0:
2497 ; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2498 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2499 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2500 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
2501 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2502 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2503 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2504 ; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, -2.0, v1
2505 ; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
2506 ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
2508 ; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
2509 ; GFX11-FLUSH: ; %bb.0:
2510 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
2511 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2512 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
2513 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2514 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2515 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
2516 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2517 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2518 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2519 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
2520 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
2521 ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
2522 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
2523 ; GFX11-FLUSH-NEXT: s_nop 0
2524 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2525 ; GFX11-FLUSH-NEXT: s_endpgm
2527 ; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
2528 ; GFX11-DENORM-STRICT: ; %bb.0:
2529 ; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
2530 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2531 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2532 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2533 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
2534 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
2535 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2536 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2537 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2538 ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
2539 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2540 ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v2, v1
2541 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
2542 ; GFX11-DENORM-STRICT-NEXT: s_nop 0
2543 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2544 ; GFX11-DENORM-STRICT-NEXT: s_endpgm
2546 ; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
2547 ; GFX11-DENORM-CONTRACT: ; %bb.0:
2548 ; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
2549 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2550 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2551 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2552 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2553 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
2554 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2555 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2556 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2557 ; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, -2.0, v1
2558 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
2559 ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
2560 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2561 ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
; Addresses: %gep.0 is element tid of %out and %gep.1 is the element after it.
; %gep.out is computed from the same base and index as %gep.0.
2562 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2563 %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
2564 %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
2565 %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
; The loads are volatile; the expected code issues them one at a time, each
; followed by its own wait.
2567 %r1 = load volatile half, ptr addrspace(1) %gep.0
2568 %r2 = load volatile half, ptr addrspace(1) %gep.1
; a + a is the subtrahend: the result is %r2 - (%r1 + %r1).
2570 %add = fadd half %r1, %r1
2571 %r3 = fsub half %r2, %add
2573 store half %r3, ptr addrspace(1) %gep.out
; Kernel under test: reads a = out[tid] and c = out[tid + 1] with volatile
; loads, computes (a + a) - c in half precision, and stores the result back to
; out[tid]. The %in argument is not referenced by the instructions below.
;
; What the autogenerated assertions below pin down for this pattern:
;   * fiji with flushed f16 denormals: one v_mad_f16 with operands (a, 2.0, -c).
;   * fiji with IEEE f16 denormals and -fp-contract=fast: one v_fma_f16 with
;     operands (a, 2.0, -c).
;   * GFX10 and GFX11 flush and denorm-strict check groups: v_add_f16 followed
;     by v_sub_f16, i.e. no fusion.
;   * GFX10 and GFX11 denorm-contract check groups: one v_fma_f16 with
;     operands (a, 2.0, -c).
;
; NOTE(review): the fiji run with IEEE denormals and -fp-contract=on only
; enables the shared VI-DENORM prefix, and this function has no assertions
; under that prefix, so that configuration appears to be unchecked here.
; Consider giving that RUN line a dedicated strict prefix and regenerating the
; assertions with update_llc_test_checks.py (do not hand-edit them).
2577 define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
2578 ; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
2579 ; VI-FLUSH: ; %bb.0:
2580 ; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2581 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2582 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2583 ; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
2584 ; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2585 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2586 ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
2587 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
2588 ; VI-FLUSH-NEXT: flat_load_ushort v4, v[0:1] glc
2589 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
2590 ; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc
2591 ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
2592 ; VI-FLUSH-NEXT: v_mad_f16 v2, v4, 2.0, -v2
2593 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2
2594 ; VI-FLUSH-NEXT: s_endpgm
2596 ; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
2597 ; VI-DENORM-CONTRACT: ; %bb.0:
2598 ; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2599 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2600 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2601 ; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
2602 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2603 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2604 ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
2605 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
2606 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v4, v[0:1] glc
2607 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2608 ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3] glc
2609 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2610 ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v4, 2.0, -v2
2611 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
2612 ; VI-DENORM-CONTRACT-NEXT: s_endpgm
2614 ; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
2615 ; GFX10-FLUSH: ; %bb.0:
2616 ; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2617 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2618 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2619 ; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
2620 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
2621 ; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2622 ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
2623 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
2624 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
2625 ; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
2626 ; GFX10-FLUSH-NEXT: s_endpgm
2628 ; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
2629 ; GFX10-DENORM-STRICT: ; %bb.0:
2630 ; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2631 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2632 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
2633 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
2634 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2635 ; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2636 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2637 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
2638 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v2
2639 ; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
2640 ; GFX10-DENORM-STRICT-NEXT: s_endpgm
2642 ; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
2643 ; GFX10-DENORM-CONTRACT: ; %bb.0:
2644 ; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2645 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2646 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2647 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
2648 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2649 ; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
2650 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2651 ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, 2.0, -v2
2652 ; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1]
2653 ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
2655 ; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
2656 ; GFX11-FLUSH: ; %bb.0:
2657 ; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
2658 ; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2659 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
2660 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2661 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
2662 ; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
2663 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2664 ; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2665 ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
2666 ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
2667 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
2668 ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
2669 ; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
2670 ; GFX11-FLUSH-NEXT: s_nop 0
2671 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2672 ; GFX11-FLUSH-NEXT: s_endpgm
2674 ; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
2675 ; GFX11-DENORM-STRICT: ; %bb.0:
2676 ; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
2677 ; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2678 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2679 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2680 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
2681 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
2682 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2683 ; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2684 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
2685 ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
2686 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2687 ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v2
2688 ; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
2689 ; GFX11-DENORM-STRICT-NEXT: s_nop 0
2690 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2691 ; GFX11-DENORM-STRICT-NEXT: s_endpgm
2693 ; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
2694 ; GFX11-DENORM-CONTRACT: ; %bb.0:
2695 ; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
2696 ; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2697 ; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
2698 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
2699 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
2700 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
2701 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2702 ; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
2703 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
2704 ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, 2.0, -v2
2705 ; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
2706 ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
2707 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2708 ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
; Element addresses: %gep.0 and %gep.out are the same expression (out + tid),
; and %gep.1 is the half immediately after %gep.0.
2709 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2710 %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
2711 %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
2712 %gep.out = getelementptr half, ptr addrspace(1) %out, i32 %tid
; Both inputs are loaded volatile; in the expected output above each load is
; followed by its own s_waitcnt.
2714 %r1 = load volatile half, ptr addrspace(1) %gep.0
2715 %r2 = load volatile half, ptr addrspace(1) %gep.1
; (a + a) - c. The fused forms in the assertions above express the self-add as
; a multiply by 2.0 with the subtrahend negated.
2717 %add = fadd half %r1, %r1
2718 %r3 = fsub half %add, %r2
2720 store half %r3, ptr addrspace(1) %gep.out
; Attribute groups #0 and #1.
; NOTE(review): the kernel defined immediately above carries no attribute-group
; reference, and its workitem.id.x call site does not use #1; confirm these
; groups are still referenced elsewhere in the file.
2724 attributes #0 = { nounwind }
2725 attributes #1 = { nounwind readnone }