1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=SI,SI-SAFE %s
3 ; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=SI,SI-NSZ %s
5 ; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-SAFE %s
6 ; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-NSZ %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-SAFE %s
9 ; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-NSZ %s
11 ; --------------------------------------------------------------------------------
; fneg of fadd tests
13 ; --------------------------------------------------------------------------------
; Negates the result of a single-use fadd of the two half arguments.
; The SAFE runs expect the add followed by a sign-bit xor; the NSZ runs expect
; the negation folded into the operands (a subtract whose first source is
; negated).
15 define half @v_fneg_add_f16(half %a, half %b) #0 {
16 ; SI-SAFE-LABEL: v_fneg_add_f16:
18 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
20 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
21 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
22 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
23 ; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1
24 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
25 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
27 ; SI-NSZ-LABEL: v_fneg_add_f16:
29 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
31 ; SI-NSZ-NEXT: v_cvt_f16_f32_e64 v0, -v0
32 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
33 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
34 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1
35 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
37 ; VI-SAFE-LABEL: v_fneg_add_f16:
39 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40 ; VI-SAFE-NEXT: v_add_f16_e32 v0, v0, v1
41 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
42 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
44 ; VI-NSZ-LABEL: v_fneg_add_f16:
46 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47 ; VI-NSZ-NEXT: v_sub_f16_e64 v0, -v0, v1
48 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
50 ; GFX11-SAFE-LABEL: v_fneg_add_f16:
51 ; GFX11-SAFE: ; %bb.0:
52 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
53 ; GFX11-SAFE-NEXT: v_add_f16_e32 v0, v0, v1
54 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
55 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
56 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
58 ; GFX11-NSZ-LABEL: v_fneg_add_f16:
60 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61 ; GFX11-NSZ-NEXT: v_sub_f16_e64 v0, -v0, v1
62 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
63 %add = fadd half %a, %b
64 %fneg = fneg half %add
; Returns both the negated fadd result (element 0) and the fadd result itself
; (element 1), so the add has a second, non-foldable use.
; Every run expects the add to be kept and the negation done as a separate
; sign-bit xor.
68 define { half, half } @v_fneg_add_store_use_add_f16(half %a, half %b) #0 {
69 ; SI-LABEL: v_fneg_add_store_use_add_f16:
71 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
73 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
74 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
75 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
76 ; SI-NEXT: v_add_f32_e32 v1, v0, v1
77 ; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
78 ; SI-NEXT: s_setpc_b64 s[30:31]
80 ; VI-LABEL: v_fneg_add_store_use_add_f16:
82 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83 ; VI-NEXT: v_add_f16_e32 v1, v0, v1
84 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v1
85 ; VI-NEXT: s_setpc_b64 s[30:31]
87 ; GFX11-LABEL: v_fneg_add_store_use_add_f16:
89 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
90 ; GFX11-NEXT: v_add_f16_e32 v1, v0, v1
91 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
92 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1
93 ; GFX11-NEXT: s_setpc_b64 s[30:31]
94 %add = fadd half %a, %b
95 %fneg = fneg half %add
96 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
97 %insert.1 = insertvalue { half, half } %insert.0, half %add, 1
98 ret { half, half } %insert.1
; Returns the negated fadd result (element 0) and the fadd result multiplied
; by 4.0 (element 1), so the add feeds a second arithmetic use.
; The SAFE runs expect add, sign-bit xor, and a multiply by 4.0; the NSZ runs
; expect the negated sum computed directly (subtract with a negated source)
; and the second use rewritten as a multiply by -4.0 of that negated value.
101 define { half, half } @v_fneg_add_multi_use_add_f16(half %a, half %b) #0 {
102 ; SI-SAFE-LABEL: v_fneg_add_multi_use_add_f16:
104 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
105 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
106 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
107 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
108 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
109 ; SI-SAFE-NEXT: v_add_f32_e32 v1, v0, v1
110 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
111 ; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1
112 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
114 ; SI-NSZ-LABEL: v_fneg_add_multi_use_add_f16:
116 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
117 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
118 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
119 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
120 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0
121 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1
122 ; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0
123 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
125 ; VI-SAFE-LABEL: v_fneg_add_multi_use_add_f16:
127 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128 ; VI-SAFE-NEXT: v_add_f16_e32 v1, v0, v1
129 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v1
130 ; VI-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v1
131 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
133 ; VI-NSZ-LABEL: v_fneg_add_multi_use_add_f16:
135 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
136 ; VI-NSZ-NEXT: v_sub_f16_e64 v0, -v0, v1
137 ; VI-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0
138 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
140 ; GFX11-SAFE-LABEL: v_fneg_add_multi_use_add_f16:
141 ; GFX11-SAFE: ; %bb.0:
142 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
143 ; GFX11-SAFE-NEXT: v_add_f16_e32 v1, v0, v1
144 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
145 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v1
146 ; GFX11-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v1
147 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
149 ; GFX11-NSZ-LABEL: v_fneg_add_multi_use_add_f16:
150 ; GFX11-NSZ: ; %bb.0:
151 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152 ; GFX11-NSZ-NEXT: v_sub_f16_e64 v0, -v0, v1
153 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
154 ; GFX11-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0
155 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
156 %add = fadd half %a, %b
157 %fneg = fneg half %add
158 %use1 = fmul half %add, 4.0
160 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
161 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1
162 ret { half, half } %insert.1
; Negates fadd(fneg(a), b).
; The SAFE runs expect the inner expression as a subtract (b - a) followed by
; a sign-bit xor; the NSZ runs expect a single subtract with the operands
; swapped (a - b) and no xor.
165 define half @v_fneg_add_fneg_x_f16(half %a, half %b) #0 {
166 ; SI-SAFE-LABEL: v_fneg_add_fneg_x_f16:
168 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
169 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
170 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
171 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
172 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
173 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v1, v0
174 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
175 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
177 ; SI-NSZ-LABEL: v_fneg_add_fneg_x_f16:
179 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
181 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
182 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
183 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
184 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1
185 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
187 ; VI-SAFE-LABEL: v_fneg_add_fneg_x_f16:
189 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190 ; VI-SAFE-NEXT: v_sub_f16_e32 v0, v1, v0
191 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
192 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
194 ; VI-NSZ-LABEL: v_fneg_add_fneg_x_f16:
196 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
197 ; VI-NSZ-NEXT: v_sub_f16_e32 v0, v0, v1
198 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
200 ; GFX11-SAFE-LABEL: v_fneg_add_fneg_x_f16:
201 ; GFX11-SAFE: ; %bb.0:
202 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
203 ; GFX11-SAFE-NEXT: v_sub_f16_e32 v0, v1, v0
204 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
205 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
206 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
208 ; GFX11-NSZ-LABEL: v_fneg_add_fneg_x_f16:
209 ; GFX11-NSZ: ; %bb.0:
210 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
211 ; GFX11-NSZ-NEXT: v_sub_f16_e32 v0, v0, v1
212 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
213 %fneg.a = fneg half %a
214 %add = fadd half %fneg.a, %b
215 %fneg = fneg half %add
; Negates fadd(a, fneg(b)); mirror image of the fneg-on-first-operand case.
; The SAFE runs expect a subtract (a - b) followed by a sign-bit xor; the NSZ
; runs expect a single subtract with the operands swapped (b - a) and no xor.
219 define half @v_fneg_add_x_fneg_f16(half %a, half %b) #0 {
220 ; SI-SAFE-LABEL: v_fneg_add_x_fneg_f16:
222 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
223 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
224 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
225 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
226 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
227 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v1
228 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
229 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
231 ; SI-NSZ-LABEL: v_fneg_add_x_fneg_f16:
233 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
235 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
236 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
237 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
238 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v1, v0
239 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
241 ; VI-SAFE-LABEL: v_fneg_add_x_fneg_f16:
243 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
244 ; VI-SAFE-NEXT: v_sub_f16_e32 v0, v0, v1
245 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
246 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
248 ; VI-NSZ-LABEL: v_fneg_add_x_fneg_f16:
250 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251 ; VI-NSZ-NEXT: v_sub_f16_e32 v0, v1, v0
252 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
254 ; GFX11-SAFE-LABEL: v_fneg_add_x_fneg_f16:
255 ; GFX11-SAFE: ; %bb.0:
256 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
257 ; GFX11-SAFE-NEXT: v_sub_f16_e32 v0, v0, v1
258 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
259 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
260 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
262 ; GFX11-NSZ-LABEL: v_fneg_add_x_fneg_f16:
263 ; GFX11-NSZ: ; %bb.0:
264 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
265 ; GFX11-NSZ-NEXT: v_sub_f16_e32 v0, v1, v0
266 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
267 %fneg.b = fneg half %b
268 %add = fadd half %a, %fneg.b
269 %fneg = fneg half %add
; Negates fadd(fneg(a), fneg(b)).
; The SAFE runs expect a subtract with a negated first source ((-a) - b)
; followed by a sign-bit xor; the NSZ runs expect all three negations to
; cancel into a plain add of a and b.
273 define half @v_fneg_add_fneg_fneg_f16(half %a, half %b) #0 {
274 ; SI-SAFE-LABEL: v_fneg_add_fneg_fneg_f16:
276 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
277 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
278 ; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
279 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
280 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
281 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v1
282 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
283 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
285 ; SI-NSZ-LABEL: v_fneg_add_fneg_fneg_f16:
287 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
289 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
290 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
291 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
292 ; SI-NSZ-NEXT: v_add_f32_e32 v0, v0, v1
293 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
295 ; VI-SAFE-LABEL: v_fneg_add_fneg_fneg_f16:
297 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
298 ; VI-SAFE-NEXT: v_sub_f16_e64 v0, -v0, v1
299 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
300 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
302 ; VI-NSZ-LABEL: v_fneg_add_fneg_fneg_f16:
304 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305 ; VI-NSZ-NEXT: v_add_f16_e32 v0, v0, v1
306 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
308 ; GFX11-SAFE-LABEL: v_fneg_add_fneg_fneg_f16:
309 ; GFX11-SAFE: ; %bb.0:
310 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
311 ; GFX11-SAFE-NEXT: v_sub_f16_e64 v0, -v0, v1
312 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
313 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
314 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
316 ; GFX11-NSZ-LABEL: v_fneg_add_fneg_fneg_f16:
317 ; GFX11-NSZ: ; %bb.0:
318 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
319 ; GFX11-NSZ-NEXT: v_add_f16_e32 v0, v0, v1
320 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
321 %fneg.a = fneg half %a
322 %fneg.b = fneg half %b
323 %add = fadd half %fneg.a, %fneg.b
324 %fneg = fneg half %add
; Returns fneg(fadd(fneg(a), b)) in element 0 and fneg(a) itself in element 1,
; so the inner fneg has a second use that must be materialized.
; The SAFE runs expect a subtract (b - a) plus a sign-bit xor for element 0;
; the NSZ runs expect a single subtract (a - b). In both, the negated a for
; element 1 is produced separately.
328 define { half, half } @v_fneg_add_store_use_fneg_x_f16(half %a, half %b) #0 {
329 ; SI-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16:
331 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
333 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
334 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v0
335 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1
336 ; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v1, -v0
337 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v3, v2
338 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
339 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
341 ; SI-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16:
343 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
344 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
345 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
346 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v1
347 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v0
348 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0
349 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v3, v2
350 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
352 ; VI-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16:
354 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
355 ; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v0
356 ; VI-SAFE-NEXT: v_sub_f16_e32 v0, v1, v0
357 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
358 ; VI-SAFE-NEXT: v_mov_b32_e32 v1, v2
359 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
361 ; VI-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16:
363 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364 ; VI-NSZ-NEXT: v_xor_b32_e32 v2, 0x8000, v0
365 ; VI-NSZ-NEXT: v_sub_f16_e32 v0, v0, v1
366 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, v2
367 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
369 ; GFX11-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16:
370 ; GFX11-SAFE: ; %bb.0:
371 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
372 ; GFX11-SAFE-NEXT: v_sub_f16_e32 v1, v1, v0
373 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
374 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v1
375 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
376 ; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, v2
377 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
379 ; GFX11-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16:
380 ; GFX11-NSZ: ; %bb.0:
381 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
382 ; GFX11-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1
383 ; GFX11-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0
384 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2)
385 ; GFX11-NSZ-NEXT: v_mov_b32_e32 v0, v2
386 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
387 %fneg.a = fneg half %a
388 %add = fadd half %fneg.a, %b
389 %fneg = fneg half %add
390 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
391 %insert.1 = insertvalue { half, half } %insert.0, half %fneg.a, 1
392 ret { half, half } %insert.1
; Returns fneg(fadd(fneg(a), b)) in element 0 and fmul(fneg(a), c) in
; element 1, so the inner fneg also feeds a multiply.
; The SAFE runs expect a subtract (b - a) plus a sign-bit xor for element 0;
; the NSZ runs expect a single subtract (a - b). In every run the multiply
; consumes the negated a without a separate xor (as a negated source on the
; multiply, or on the conversion feeding it in the SI-prefixed runs).
395 define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c) #0 {
396 ; SI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16:
398 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
399 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
400 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
401 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
402 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v0
403 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
404 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
405 ; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v4, -v0
406 ; SI-SAFE-NEXT: v_sub_f32_e32 v0, v1, v3
407 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
408 ; SI-SAFE-NEXT: v_mul_f32_e32 v1, v4, v2
409 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
411 ; SI-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16:
413 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
415 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
416 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
417 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
418 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
419 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v0
420 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v4, -v0
421 ; SI-NSZ-NEXT: v_sub_f32_e32 v0, v3, v1
422 ; SI-NSZ-NEXT: v_mul_f32_e32 v1, v4, v2
423 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
425 ; VI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16:
427 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428 ; VI-SAFE-NEXT: v_sub_f16_e32 v1, v1, v0
429 ; VI-SAFE-NEXT: v_xor_b32_e32 v3, 0x8000, v1
430 ; VI-SAFE-NEXT: v_mul_f16_e64 v1, -v0, v2
431 ; VI-SAFE-NEXT: v_mov_b32_e32 v0, v3
432 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
434 ; VI-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16:
436 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
437 ; VI-NSZ-NEXT: v_sub_f16_e32 v3, v0, v1
438 ; VI-NSZ-NEXT: v_mul_f16_e64 v1, -v0, v2
439 ; VI-NSZ-NEXT: v_mov_b32_e32 v0, v3
440 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
442 ; GFX11-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16:
443 ; GFX11-SAFE: ; %bb.0:
444 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445 ; GFX11-SAFE-NEXT: v_sub_f16_e32 v1, v1, v0
446 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
447 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v3, 0x8000, v1
448 ; GFX11-SAFE-NEXT: v_mul_f16_e64 v1, -v0, v2
449 ; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, v3
450 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
452 ; GFX11-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16:
453 ; GFX11-NSZ: ; %bb.0:
454 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
455 ; GFX11-NSZ-NEXT: v_sub_f16_e32 v3, v0, v1
456 ; GFX11-NSZ-NEXT: v_mul_f16_e64 v1, -v0, v2
457 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2)
458 ; GFX11-NSZ-NEXT: v_mov_b32_e32 v0, v3
459 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
460 %fneg.a = fneg half %a
461 %add = fadd half %fneg.a, %b
462 %fneg = fneg half %add
463 %use1 = fmul half %fneg.a, %c
465 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
466 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1
467 ret { half, half } %insert.1
470 ; This test used to hit an assertion when run with -enable-no-signed-zeros-fp-math.
; Pixel-shader (amdgpu_ps) test with inreg half inputs. The IR computes
; 1.0 / %tmp6, multiplies it by 0.0 twice (the second multiply carries the
; reassoc/nnan/arcp/contract flags), adds 0.0, and then selects between
; %tmp2 and the negated sum based on an unordered-or-greater-equal compare.
; A second compare against 0.0 selects either 0.0 or a NaN constant.
; The SAFE runs expect the add of zero and a sign-bit flip of the sum to
; remain; the NSZ runs expect them folded into a multiply by the negative-zero
; bit pattern (0x8000 / 0x80000000) with a negated compare operand.
471 define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #0 {
472 ; SI-SAFE-LABEL: fneg_fadd_0_f16:
473 ; SI-SAFE: ; %bb.0: ; %.entry
474 ; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
475 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s1
476 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, s0
477 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
478 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
479 ; SI-SAFE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
480 ; SI-SAFE-NEXT: v_rcp_f32_e32 v3, v2
481 ; SI-SAFE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
482 ; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
483 ; SI-SAFE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
484 ; SI-SAFE-NEXT: v_fma_f32 v3, v5, v3, v3
485 ; SI-SAFE-NEXT: v_mul_f32_e32 v5, v4, v3
486 ; SI-SAFE-NEXT: v_fma_f32 v6, -v2, v5, v4
487 ; SI-SAFE-NEXT: v_fma_f32 v5, v6, v3, v5
488 ; SI-SAFE-NEXT: v_fma_f32 v2, -v2, v5, v4
489 ; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
490 ; SI-SAFE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
491 ; SI-SAFE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
492 ; SI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0
493 ; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
494 ; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
495 ; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
496 ; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
497 ; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
498 ; SI-SAFE-NEXT: ; return to shader part epilog
500 ; SI-NSZ-LABEL: fneg_fadd_0_f16:
501 ; SI-NSZ: ; %bb.0: ; %.entry
502 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
503 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1
504 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0
505 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
506 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
507 ; SI-NSZ-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
508 ; SI-NSZ-NEXT: v_rcp_f32_e32 v3, v2
509 ; SI-NSZ-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
510 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
511 ; SI-NSZ-NEXT: v_fma_f32 v5, -v2, v3, 1.0
512 ; SI-NSZ-NEXT: v_fma_f32 v3, v5, v3, v3
513 ; SI-NSZ-NEXT: v_mul_f32_e32 v5, v4, v3
514 ; SI-NSZ-NEXT: v_fma_f32 v6, -v2, v5, v4
515 ; SI-NSZ-NEXT: v_fma_f32 v5, v6, v3, v5
516 ; SI-NSZ-NEXT: v_fma_f32 v2, -v2, v5, v4
517 ; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
518 ; SI-NSZ-NEXT: v_div_fmas_f32 v2, v2, v3, v5
519 ; SI-NSZ-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
520 ; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
521 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1
522 ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
523 ; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
524 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
525 ; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
526 ; SI-NSZ-NEXT: ; return to shader part epilog
528 ; VI-SAFE-LABEL: fneg_fadd_0_f16:
529 ; VI-SAFE: ; %bb.0: ; %.entry
530 ; VI-SAFE-NEXT: v_rcp_f16_e32 v0, s1
531 ; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0
532 ; VI-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
533 ; VI-SAFE-NEXT: v_add_f16_e32 v0, 0, v0
534 ; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v0
535 ; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, s0, v0
536 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
537 ; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7e00
538 ; VI-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
539 ; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
540 ; VI-SAFE-NEXT: ; return to shader part epilog
542 ; VI-NSZ-LABEL: fneg_fadd_0_f16:
543 ; VI-NSZ: ; %bb.0: ; %.entry
544 ; VI-NSZ-NEXT: v_rcp_f16_e32 v0, s1
545 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, s0
546 ; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
547 ; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0
548 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
549 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00
550 ; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
551 ; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
552 ; VI-NSZ-NEXT: ; return to shader part epilog
554 ; GFX11-SAFE-LABEL: fneg_fadd_0_f16:
555 ; GFX11-SAFE: ; %bb.0: ; %.entry
556 ; GFX11-SAFE-NEXT: v_rcp_f16_e32 v0, s1
557 ; GFX11-SAFE-NEXT: s_waitcnt_depctr 0xfff
558 ; GFX11-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
559 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
560 ; GFX11-SAFE-NEXT: v_add_f16_e32 v0, 0, v0
561 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
562 ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0
563 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
564 ; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
565 ; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
566 ; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
567 ; GFX11-SAFE-NEXT: ; return to shader part epilog
569 ; GFX11-NSZ-LABEL: fneg_fadd_0_f16:
570 ; GFX11-NSZ: ; %bb.0: ; %.entry
571 ; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1
572 ; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff
573 ; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
574 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
575 ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0
576 ; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
577 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
578 ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
579 ; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
580 ; GFX11-NSZ-NEXT: ; return to shader part epilog
582 %tmp7 = fdiv half 1.000000e+00, %tmp6
583 %tmp8 = fmul half 0.000000e+00, %tmp7
584 %tmp9 = fmul reassoc nnan arcp contract half 0.000000e+00, %tmp8
585 %.i188 = fadd half %tmp9, 0.000000e+00
586 %tmp10 = fcmp uge half %.i188, %tmp2
587 %tmp11 = fneg half %.i188
588 %.i092 = select i1 %tmp10, half %tmp2, half %tmp11
589 %tmp12 = fcmp ule half %.i092, 0.000000e+00
590 %.i198 = select i1 %tmp12, half 0.000000e+00, half 0x7FF8000000000000
594 ; Workaround: -enable-no-signed-zeros-fp-math does not automatically set the
595 ; unsafe-fp-math function attribute. Merge with the previous test once it does.
; Variant of fneg_fadd_0_f16 with the same instruction sequence, except that
; the fdiv carries the afn flag and the function uses attribute group #2
; instead of #0 (the attribute definitions are not visible in this part of
; the file).
; In the SAFE runs the expected code contains no reciprocal or multiply at
; all, only compares and selects on the first input; the NSZ runs still
; expect a reciprocal followed by a multiply by the negative-zero bit pattern.
597 define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #2 {
598 ; SI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
599 ; SI-SAFE: ; %bb.0: ; %.entry
600 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s0
601 ; SI-SAFE-NEXT: s_brev_b32 s0, 1
602 ; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
603 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
604 ; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, 0, v0
605 ; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0
606 ; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
607 ; SI-SAFE-NEXT: ; return to shader part epilog
609 ; SI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
610 ; SI-NSZ: ; %bb.0: ; %.entry
611 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1
612 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0
613 ; SI-NSZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000
614 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
615 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
616 ; SI-NSZ-NEXT: v_rcp_f32_e32 v0, v0
617 ; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
618 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1
619 ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
620 ; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
621 ; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
622 ; SI-NSZ-NEXT: ; return to shader part epilog
624 ; VI-SAFE-LABEL: fneg_fadd_0_nsz_f16:
625 ; VI-SAFE: ; %bb.0: ; %.entry
626 ; VI-SAFE-NEXT: v_mov_b32_e32 v0, 0x8000
627 ; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0
628 ; VI-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc, s0, 0
629 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
630 ; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7e00
631 ; VI-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
632 ; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
633 ; VI-SAFE-NEXT: ; return to shader part epilog
635 ; VI-NSZ-LABEL: fneg_fadd_0_nsz_f16:
636 ; VI-NSZ: ; %bb.0: ; %.entry
637 ; VI-NSZ-NEXT: v_rcp_f16_e32 v0, s1
638 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, s0
639 ; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
640 ; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0
641 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
642 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00
643 ; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
644 ; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
645 ; VI-NSZ-NEXT: ; return to shader part epilog
647 ; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16:
648 ; GFX11-SAFE: ; %bb.0: ; %.entry
649 ; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, s0
650 ; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc_lo, s0, 0
651 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
652 ; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x8000, v0, vcc_lo
653 ; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
654 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3)
655 ; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
656 ; GFX11-SAFE-NEXT: ; return to shader part epilog
658 ; GFX11-NSZ-LABEL: fneg_fadd_0_nsz_f16:
659 ; GFX11-NSZ: ; %bb.0: ; %.entry
660 ; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1
661 ; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff
662 ; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
663 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
664 ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0
665 ; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
666 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
667 ; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
668 ; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
669 ; GFX11-NSZ-NEXT: ; return to shader part epilog
671 %tmp7 = fdiv afn half 1.000000e+00, %tmp6
672 %tmp8 = fmul half 0.000000e+00, %tmp7
673 %tmp9 = fmul reassoc nnan arcp contract half 0.000000e+00, %tmp8
674 %.i188 = fadd half %tmp9, 0.000000e+00
675 %tmp10 = fcmp uge half %.i188, %tmp2
676 %tmp11 = fneg half %.i188
677 %.i092 = select i1 %tmp10, half %tmp2, half %tmp11
678 %tmp12 = fcmp ule half %.i092, 0.000000e+00
679 %.i198 = select i1 %tmp12, half 0.000000e+00, half 0x7FF8000000000000
683 ; --------------------------------------------------------------------------------
; fneg of fmul tests
685 ; --------------------------------------------------------------------------------
; Negates the result of a single-use fmul of the two half arguments.
; Every run expects the negation folded into one multiply operand (a negated
; source on the multiply, or on the conversion feeding it in the SI-prefixed
; runs) with no separate xor; the expectations do not differ between the SAFE
; and NSZ runs.
687 define half @v_fneg_mul_f16(half %a, half %b) #0 {
688 ; SI-LABEL: v_fneg_mul_f16:
690 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
691 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
692 ; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
693 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
694 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
695 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
696 ; SI-NEXT: s_setpc_b64 s[30:31]
698 ; VI-LABEL: v_fneg_mul_f16:
700 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
701 ; VI-NEXT: v_mul_f16_e64 v0, v0, -v1
702 ; VI-NEXT: s_setpc_b64 s[30:31]
704 ; GFX11-LABEL: v_fneg_mul_f16:
706 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
707 ; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v1
708 ; GFX11-NEXT: s_setpc_b64 s[30:31]
709 %mul = fmul half %a, %b
710 %fneg = fneg half %mul
; Returns both the negated fmul result (element 0) and the fmul result itself
; (element 1), so the multiply has a second, non-foldable use.
; Every run expects the multiply to be kept and the negation done as a
; separate sign-bit xor.
714 define { half, half } @v_fneg_mul_store_use_mul_f16(half %a, half %b) #0 {
715 ; SI-LABEL: v_fneg_mul_store_use_mul_f16:
717 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
718 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
719 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
720 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
721 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
722 ; SI-NEXT: v_mul_f32_e32 v1, v0, v1
723 ; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
724 ; SI-NEXT: s_setpc_b64 s[30:31]
726 ; VI-LABEL: v_fneg_mul_store_use_mul_f16:
728 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
729 ; VI-NEXT: v_mul_f16_e32 v1, v0, v1
730 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v1
731 ; VI-NEXT: s_setpc_b64 s[30:31]
733 ; GFX11-LABEL: v_fneg_mul_store_use_mul_f16:
735 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
736 ; GFX11-NEXT: v_mul_f16_e32 v1, v0, v1
737 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
738 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1
739 ; GFX11-NEXT: s_setpc_b64 s[30:31]
740 %mul = fmul half %a, %b
741 %fneg = fneg half %mul
742 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
743 %insert.1 = insertvalue { half, half } %insert.0, half %mul, 1
744 ret { half, half } %insert.1
; Returns the negated fmul result (element 0) and the fmul result multiplied
; by 4.0 (element 1), so the multiply feeds a second arithmetic use.
; Every run expects the negated product computed directly (one operand
; negated) and the second use rewritten as a multiply by -4.0 of that negated
; product.
747 define { half, half } @v_fneg_mul_multi_use_mul_f16(half %a, half %b) #0 {
748 ; SI-LABEL: v_fneg_mul_multi_use_mul_f16:
750 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
751 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
752 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
753 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
754 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
755 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
756 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
757 ; SI-NEXT: s_setpc_b64 s[30:31]
759 ; VI-LABEL: v_fneg_mul_multi_use_mul_f16:
761 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
762 ; VI-NEXT: v_mul_f16_e64 v0, v0, -v1
763 ; VI-NEXT: v_mul_f16_e32 v1, -4.0, v0
764 ; VI-NEXT: s_setpc_b64 s[30:31]
766 ; GFX11-LABEL: v_fneg_mul_multi_use_mul_f16:
768 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
769 ; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v1
770 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
771 ; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v0
772 ; GFX11-NEXT: s_setpc_b64 s[30:31]
773 %mul = fmul half %a, %b
774 %fneg = fneg half %mul
775 %use1 = fmul half %mul, 4.0
776 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
777 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1
778 ret { half, half } %insert.1
; Negates fmul(fneg(a), b).
; Every run expects the two negations to cancel, leaving a plain multiply of
; a and b with no source negation and no xor.
781 define half @v_fneg_mul_fneg_x_f16(half %a, half %b) #0 {
782 ; SI-LABEL: v_fneg_mul_fneg_x_f16:
784 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
785 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
786 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
787 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
788 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
789 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
790 ; SI-NEXT: s_setpc_b64 s[30:31]
792 ; VI-LABEL: v_fneg_mul_fneg_x_f16:
794 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
795 ; VI-NEXT: v_mul_f16_e32 v0, v0, v1
796 ; VI-NEXT: s_setpc_b64 s[30:31]
798 ; GFX11-LABEL: v_fneg_mul_fneg_x_f16:
800 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
801 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
802 ; GFX11-NEXT: s_setpc_b64 s[30:31]
803 %fneg.a = fneg half %a
804 %mul = fmul half %fneg.a, %b
805 %fneg = fneg half %mul
; Negates fmul(a, fneg(b)); mirror image of the fneg-on-first-operand case.
; Every run expects the two negations to cancel, leaving a plain multiply of
; a and b.
809 define half @v_fneg_mul_x_fneg_f16(half %a, half %b) #0 {
810 ; SI-LABEL: v_fneg_mul_x_fneg_f16:
812 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
813 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
814 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
815 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
816 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
817 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
818 ; SI-NEXT: s_setpc_b64 s[30:31]
820 ; VI-LABEL: v_fneg_mul_x_fneg_f16:
822 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
823 ; VI-NEXT: v_mul_f16_e32 v0, v0, v1
824 ; VI-NEXT: s_setpc_b64 s[30:31]
826 ; GFX11-LABEL: v_fneg_mul_x_fneg_f16:
828 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
829 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
830 ; GFX11-NEXT: s_setpc_b64 s[30:31]
831 %fneg.b = fneg half %b
832 %mul = fmul half %a, %fneg.b
833 %fneg = fneg half %mul
; fneg(fmul(fneg(a), fneg(b))): three negations leave a single one. The checks
; expect it as a neg modifier on b: `-v1` on the multiply for VI/GFX11, and on
; the f32->f16 conversion of b for SI.
837 define half @v_fneg_mul_fneg_fneg_f16(half %a, half %b) #0 {
838 ; SI-LABEL: v_fneg_mul_fneg_fneg_f16:
840 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
841 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
842 ; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
843 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
844 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
845 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
846 ; SI-NEXT: s_setpc_b64 s[30:31]
848 ; VI-LABEL: v_fneg_mul_fneg_fneg_f16:
850 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
851 ; VI-NEXT: v_mul_f16_e64 v0, v0, -v1
852 ; VI-NEXT: s_setpc_b64 s[30:31]
854 ; GFX11-LABEL: v_fneg_mul_fneg_fneg_f16:
856 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
857 ; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v1
858 ; GFX11-NEXT: s_setpc_b64 s[30:31]
859 %fneg.a = fneg half %a
860 %fneg.b = fneg half %b
861 %mul = fmul half %fneg.a, %fneg.b
862 %fneg = fneg half %mul
; fneg(fmul(fneg(a), b)) where fneg(a) is also returned directly as the second
; struct element. The checks expect element 0 to be a plain a*b and element 1
; (-a) to be materialized separately: a sign-bit xor with 0x8000 on VI/GFX11,
; a negated f16->f32 conversion on SI.
866 define { half, half } @v_fneg_mul_store_use_fneg_x_f16(half %a, half %b) #0 {
867 ; SI-LABEL: v_fneg_mul_store_use_fneg_x_f16:
869 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
871 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
872 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1
873 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
874 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v0
875 ; SI-NEXT: v_mul_f32_e32 v0, v3, v2
876 ; SI-NEXT: s_setpc_b64 s[30:31]
878 ; VI-LABEL: v_fneg_mul_store_use_fneg_x_f16:
880 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
881 ; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v0
882 ; VI-NEXT: v_mul_f16_e32 v0, v0, v1
883 ; VI-NEXT: v_mov_b32_e32 v1, v2
884 ; VI-NEXT: s_setpc_b64 s[30:31]
886 ; GFX11-LABEL: v_fneg_mul_store_use_fneg_x_f16:
888 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
889 ; GFX11-NEXT: v_mul_f16_e32 v2, v0, v1
890 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v0
891 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
892 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
893 ; GFX11-NEXT: s_setpc_b64 s[30:31]
894 %fneg.a = fneg half %a
895 %mul = fmul half %fneg.a, %b
896 %fneg = fneg half %mul
897 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
898 %insert.1 = insertvalue { half, half } %insert.0, half %fneg.a, 1
899 ret { half, half } %insert.1
; fneg(fmul(fneg(a), b)) where fneg(a) has a second arithmetic use,
; fmul(fneg(a), c). The checks expect element 0 to be a plain a*b and element 1
; to absorb the negation as a source modifier on a (`-v0` on VI/GFX11, a negated
; f16->f32 conversion on SI).
902 define { half, half } @v_fneg_mul_multi_use_fneg_x_f16(half %a, half %b, half %c) #0 {
903 ; SI-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
905 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
906 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
907 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
908 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
909 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
910 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
911 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
912 ; SI-NEXT: v_cvt_f32_f16_e64 v4, -v0
913 ; SI-NEXT: v_mul_f32_e32 v0, v3, v1
914 ; SI-NEXT: v_mul_f32_e32 v1, v4, v2
915 ; SI-NEXT: s_setpc_b64 s[30:31]
917 ; VI-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
919 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
920 ; VI-NEXT: v_mul_f16_e32 v3, v0, v1
921 ; VI-NEXT: v_mul_f16_e64 v1, -v0, v2
922 ; VI-NEXT: v_mov_b32_e32 v0, v3
923 ; VI-NEXT: s_setpc_b64 s[30:31]
925 ; GFX11-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
927 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
928 ; GFX11-NEXT: v_mul_f16_e32 v3, v0, v1
929 ; GFX11-NEXT: v_mul_f16_e64 v1, -v0, v2
930 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
931 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
932 ; GFX11-NEXT: s_setpc_b64 s[30:31]
933 %fneg.a = fneg half %a
934 %mul = fmul half %fneg.a, %b
935 %fneg = fneg half %mul
936 %use1 = fmul half %fneg.a, %c
937 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
938 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1
939 ret { half, half } %insert.1
942 ; --------------------------------------------------------------------------------
944 ; --------------------------------------------------------------------------------
; fneg(minnum(a, b)) under attribute group #0: the checks expect the negation
; to be pushed into the operands, turning the min into a max of -a and -b.
; Each input first goes through its own negated step (a negated self-max on
; VI/GFX11; a negated convert plus a multiply by 1.0 on SI) before the final
; max. NOTE(review): that extra step is presumably the IEEE-mode input
; quieting; the definition of #0 is not visible here.
946 define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 {
947 ; SI-LABEL: v_fneg_minnum_f16_ieee:
949 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
950 ; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
951 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
952 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
953 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
954 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
955 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
956 ; SI-NEXT: v_max_f32_e32 v0, v0, v1
957 ; SI-NEXT: s_setpc_b64 s[30:31]
959 ; VI-LABEL: v_fneg_minnum_f16_ieee:
961 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
962 ; VI-NEXT: v_max_f16_e64 v1, -v1, -v1
963 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
964 ; VI-NEXT: v_max_f16_e32 v0, v0, v1
965 ; VI-NEXT: s_setpc_b64 s[30:31]
967 ; GFX11-LABEL: v_fneg_minnum_f16_ieee:
969 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
970 ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
971 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
972 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
973 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
974 ; GFX11-NEXT: s_setpc_b64 s[30:31]
975 %min = call half @llvm.minnum.f16(half %a, half %b)
976 %fneg = fneg half %min
; fneg(minnum(a, b)) under attribute group #4 (presumably IEEE mode disabled,
; per the _no_ieee suffix; the attribute definition is not visible here).
; The checks expect a single max with neg modifiers on both operands on
; VI/GFX11, and negated conversions feeding one v_max_f32 on SI, with no extra
; per-input step.
980 define half @v_fneg_minnum_f16_no_ieee(half %a, half %b) #4 {
981 ; SI-LABEL: v_fneg_minnum_f16_no_ieee:
983 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
984 ; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
985 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
986 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
987 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
988 ; SI-NEXT: v_max_f32_e32 v0, v0, v1
989 ; SI-NEXT: s_setpc_b64 s[30:31]
991 ; VI-LABEL: v_fneg_minnum_f16_no_ieee:
993 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
994 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v1
995 ; VI-NEXT: s_setpc_b64 s[30:31]
997 ; GFX11-LABEL: v_fneg_minnum_f16_no_ieee:
999 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1000 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v1
1001 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1002 %min = call half @llvm.minnum.f16(half %a, half %b)
1003 %fneg = fneg half %min
; fneg(minnum(a, a)): both minnum operands are the same value, and the checks
; expect no min/max instruction at all, only a sign-bit flip (xor with
; 0x80000000 on SI, 0x8000 on VI/GFX11).
1007 define half @v_fneg_self_minnum_f16_ieee(half %a) #0 {
1008 ; SI-LABEL: v_fneg_self_minnum_f16_ieee:
1010 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1011 ; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1012 ; SI-NEXT: s_setpc_b64 s[30:31]
1014 ; VI-LABEL: v_fneg_self_minnum_f16_ieee:
1016 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1017 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1018 ; VI-NEXT: s_setpc_b64 s[30:31]
1020 ; GFX11-LABEL: v_fneg_self_minnum_f16_ieee:
1022 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1023 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1024 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1025 %min = call half @llvm.minnum.f16(half %a, half %a)
1026 %min.fneg = fneg half %min
; fneg(minnum(a, a)) under attribute group #4 (presumably IEEE mode disabled;
; attribute definition not visible here). As in the _ieee variant, the checks
; expect only a sign-bit xor and no min/max instruction.
1030 define half @v_fneg_self_minnum_f16_no_ieee(half %a) #4 {
1031 ; SI-LABEL: v_fneg_self_minnum_f16_no_ieee:
1033 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1034 ; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1035 ; SI-NEXT: s_setpc_b64 s[30:31]
1037 ; VI-LABEL: v_fneg_self_minnum_f16_no_ieee:
1039 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1040 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1041 ; VI-NEXT: s_setpc_b64 s[30:31]
1043 ; GFX11-LABEL: v_fneg_self_minnum_f16_no_ieee:
1045 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1046 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1047 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1048 %min = call half @llvm.minnum.f16(half %a, half %a)
1049 %min.fneg = fneg half %min
; fneg(minnum(4.0, a)): the checks expect the negation folded through the min,
; giving max(-4.0, -a). The variable input gets its own negated step first
; (negated self-max on VI/GFX11; negated convert plus multiply by 1.0 on SI).
1053 define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 {
1054 ; SI-LABEL: v_fneg_posk_minnum_f16_ieee:
1056 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1057 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1058 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1059 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1060 ; SI-NEXT: v_max_f32_e32 v0, -4.0, v0
1061 ; SI-NEXT: s_setpc_b64 s[30:31]
1063 ; VI-LABEL: v_fneg_posk_minnum_f16_ieee:
1065 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1066 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
1067 ; VI-NEXT: v_max_f16_e32 v0, -4.0, v0
1068 ; VI-NEXT: s_setpc_b64 s[30:31]
1070 ; GFX11-LABEL: v_fneg_posk_minnum_f16_ieee:
1072 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1073 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
1074 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1075 ; GFX11-NEXT: v_max_f16_e32 v0, -4.0, v0
1076 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1077 %min = call half @llvm.minnum.f16(half 4.0, half %a)
1078 %fneg = fneg half %min
; fneg(minnum(4.0, a)) under attribute group #4 (presumably IEEE mode disabled;
; attribute definition not visible here). The checks expect a single
; max(-a, -4.0) on VI/GFX11, and a negated convert feeding v_max_f32 with -4.0
; on SI.
1082 define half @v_fneg_posk_minnum_f16_no_ieee(half %a) #4 {
1083 ; SI-LABEL: v_fneg_posk_minnum_f16_no_ieee:
1085 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1086 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1087 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1088 ; SI-NEXT: v_max_f32_e32 v0, -4.0, v0
1089 ; SI-NEXT: s_setpc_b64 s[30:31]
1091 ; VI-LABEL: v_fneg_posk_minnum_f16_no_ieee:
1093 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1094 ; VI-NEXT: v_max_f16_e64 v0, -v0, -4.0
1095 ; VI-NEXT: s_setpc_b64 s[30:31]
1097 ; GFX11-LABEL: v_fneg_posk_minnum_f16_no_ieee:
1099 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1100 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -4.0
1101 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1102 %min = call half @llvm.minnum.f16(half 4.0, half %a)
1103 %fneg = fneg half %min
; fneg(minnum(-4.0, a)): the checks expect the negation folded through the min,
; giving max(4.0, -a). The variable input gets its own negated step first
; (negated self-max on VI/GFX11; negated convert plus multiply by 1.0 on SI).
1107 define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 {
1108 ; SI-LABEL: v_fneg_negk_minnum_f16_ieee:
1110 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1111 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1112 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1113 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1114 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
1115 ; SI-NEXT: s_setpc_b64 s[30:31]
1117 ; VI-LABEL: v_fneg_negk_minnum_f16_ieee:
1119 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1120 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
1121 ; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
1122 ; VI-NEXT: s_setpc_b64 s[30:31]
1124 ; GFX11-LABEL: v_fneg_negk_minnum_f16_ieee:
1126 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1127 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
1128 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1129 ; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0
1130 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1131 %min = call half @llvm.minnum.f16(half -4.0, half %a)
1132 %fneg = fneg half %min
; fneg(minnum(-4.0, a)) under attribute group #4 (presumably IEEE mode
; disabled; attribute definition not visible here). The checks expect a single
; max(-a, 4.0) on VI/GFX11, and a negated convert feeding v_max_f32 with 4.0
; on SI.
1136 define half @v_fneg_negk_minnum_f16_no_ieee(half %a) #4 {
1137 ; SI-LABEL: v_fneg_negk_minnum_f16_no_ieee:
1139 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1140 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1141 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1142 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
1143 ; SI-NEXT: s_setpc_b64 s[30:31]
1145 ; VI-LABEL: v_fneg_negk_minnum_f16_no_ieee:
1147 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1148 ; VI-NEXT: v_max_f16_e64 v0, -v0, 4.0
1149 ; VI-NEXT: s_setpc_b64 s[30:31]
1151 ; GFX11-LABEL: v_fneg_negk_minnum_f16_no_ieee:
1153 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1154 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, 4.0
1155 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1156 %min = call half @llvm.minnum.f16(half -4.0, half %a)
1157 %fneg = fneg half %min
; fneg(minnum nnan (0.0, a)): here the checks expect the negation NOT to be
; folded into the min. The min against 0 is kept and followed by an explicit
; sign-bit xor on every target. NOTE(review): presumably because folding would
; need a -0.0 constant operand; confirm against the combine's cost logic.
1161 define half @v_fneg_0_minnum_f16(half %a) #0 {
1162 ; SI-LABEL: v_fneg_0_minnum_f16:
1164 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1165 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1166 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1167 ; SI-NEXT: v_min_f32_e32 v0, 0, v0
1168 ; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1169 ; SI-NEXT: s_setpc_b64 s[30:31]
1171 ; VI-LABEL: v_fneg_0_minnum_f16:
1173 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1174 ; VI-NEXT: v_min_f16_e32 v0, 0, v0
1175 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1176 ; VI-NEXT: s_setpc_b64 s[30:31]
1178 ; GFX11-LABEL: v_fneg_0_minnum_f16:
1180 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1181 ; GFX11-NEXT: v_min_f16_e32 v0, 0, v0
1182 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1183 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1184 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1185 %min = call nnan half @llvm.minnum.f16(half 0.0, half %a)
1186 %fneg = fneg half %min
; fneg(minnum(-0.0, a)): the checks expect the negation folded through the min,
; giving max(0, -a). The variable input gets its own negated step first
; (negated self-max on VI/GFX11; negated convert plus multiply by 1.0 on SI).
1190 define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 {
1191 ; SI-LABEL: v_fneg_neg0_minnum_f16_ieee:
1193 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1194 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1195 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1196 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1197 ; SI-NEXT: v_max_f32_e32 v0, 0, v0
1198 ; SI-NEXT: s_setpc_b64 s[30:31]
1200 ; VI-LABEL: v_fneg_neg0_minnum_f16_ieee:
1202 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1203 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
1204 ; VI-NEXT: v_max_f16_e32 v0, 0, v0
1205 ; VI-NEXT: s_setpc_b64 s[30:31]
1207 ; GFX11-LABEL: v_fneg_neg0_minnum_f16_ieee:
1209 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1210 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
1211 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1212 ; GFX11-NEXT: v_max_f16_e32 v0, 0, v0
1213 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1214 %min = call half @llvm.minnum.f16(half -0.0, half %a)
1215 %fneg = fneg half %min
; fneg(minnum(0xH3118, a)); the constant prints as 0.15915494 in the VI/GFX11
; checks. The targets differ: SI's checks fold the negation (negated convert,
; then max with the literal 0xbe230000), while VI/GFX11 keep the min with the
; positive constant and apply an explicit sign-bit xor afterwards.
1219 define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
1220 ; SI-LABEL: v_fneg_inv2pi_minnum_f16:
1222 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1223 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1224 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1225 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1226 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
1227 ; SI-NEXT: s_setpc_b64 s[30:31]
1229 ; VI-LABEL: v_fneg_inv2pi_minnum_f16:
1231 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1232 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
1233 ; VI-NEXT: v_min_f16_e32 v0, 0.15915494, v0
1234 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1235 ; VI-NEXT: s_setpc_b64 s[30:31]
1237 ; GFX11-LABEL: v_fneg_inv2pi_minnum_f16:
1239 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1240 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
1241 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1242 ; GFX11-NEXT: v_min_f16_e32 v0, 0.15915494, v0
1243 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1244 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1245 %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
1246 %fneg = fneg half %min
; fneg(minnum(-1/(2*pi), a)). The constant must be the NEGATIVE value 0xHB118;
; with the positive 0xH3118 this test was an exact duplicate of
; v_fneg_inv2pi_minnum_f16 and the negative-constant case was never exercised.
; With the negative constant, the negated constant is the positive inv2pi
; value, so the expected code folds the negation through the min, giving
; max(inv2pi, -a) in the same shape as v_fneg_negk_minnum_f16_ieee.
; NOTE(review): the check lines below were updated by hand to match the new
; constant; re-run utils/update_llc_test_checks.py to confirm them.
1250 define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
1251 ; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16:
1253 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1254 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1255 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1256 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1257 ; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0
1258 ; SI-NEXT: s_setpc_b64 s[30:31]
1260 ; VI-LABEL: v_fneg_neg_inv2pi_minnum_f16:
1262 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1263 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
1264 ; VI-NEXT: v_max_f16_e32 v0, 0.15915494, v0
1266 ; VI-NEXT: s_setpc_b64 s[30:31]
1268 ; GFX11-LABEL: v_fneg_neg_inv2pi_minnum_f16:
1270 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1271 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
1272 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1273 ; GFX11-NEXT: v_max_f16_e32 v0, 0.15915494, v0
1275 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1276 %min = call half @llvm.minnum.f16(half 0xHB118, half %a)
1277 %fneg = fneg half %min
; fneg(minnum(-0.0, a)) under attribute group #4 (presumably IEEE mode
; disabled; attribute definition not visible here). The checks expect a single
; max(-a, 0) on VI/GFX11, and a negated convert feeding v_max_f32 with 0 on SI.
1281 define half @v_fneg_neg0_minnum_f16_no_ieee(half %a) #4 {
1282 ; SI-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
1284 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1285 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1286 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1287 ; SI-NEXT: v_max_f32_e32 v0, 0, v0
1288 ; SI-NEXT: s_setpc_b64 s[30:31]
1290 ; VI-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
1292 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1293 ; VI-NEXT: v_max_f16_e64 v0, -v0, 0
1294 ; VI-NEXT: s_setpc_b64 s[30:31]
1296 ; GFX11-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
1298 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1299 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, 0
1300 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1301 %min = call half @llvm.minnum.f16(half -0.0, half %a)
1302 %fneg = fneg half %min
; fneg(minnum(0.0, a)) whose only use is an fmul by b. The checks expect the
; min against 0 to be kept as-is and the negation to be absorbed as a neg
; source modifier (`-v0`) on the multiply, so no separate xor is emitted.
1306 define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 {
1307 ; SI-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
1309 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1310 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1311 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1312 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1313 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1314 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1315 ; SI-NEXT: v_min_f32_e32 v0, 0, v0
1316 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
1317 ; SI-NEXT: s_setpc_b64 s[30:31]
1319 ; VI-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
1321 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1322 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
1323 ; VI-NEXT: v_min_f16_e32 v0, 0, v0
1324 ; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
1325 ; VI-NEXT: s_setpc_b64 s[30:31]
1327 ; GFX11-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
1329 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1330 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
1331 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1332 ; GFX11-NEXT: v_min_f16_e32 v0, 0, v0
1333 ; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
1334 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1335 %min = call half @llvm.minnum.f16(half 0.0, half %a)
1336 %fneg = fneg half %min
1337 %mul = fmul half %fneg, %b
; fneg(minnum(0xH3118, a)) whose only use is an fmul by b. The targets differ:
; VI/GFX11 keep the min with the positive constant (printed 0.15915494) and
; put the negation on the multiply as `-v0`; SI instead folds the negation
; into the min (negated convert, max with literal 0xbe230000) and multiplies
; without a modifier.
1341 define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 {
1342 ; SI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
1344 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1345 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1346 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1347 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1348 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1349 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1350 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
1351 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
1352 ; SI-NEXT: s_setpc_b64 s[30:31]
1354 ; VI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
1356 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1357 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
1358 ; VI-NEXT: v_min_f16_e32 v0, 0.15915494, v0
1359 ; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
1360 ; VI-NEXT: s_setpc_b64 s[30:31]
1362 ; GFX11-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
1364 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1365 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
1366 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1367 ; GFX11-NEXT: v_min_f16_e32 v0, 0.15915494, v0
1368 ; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
1369 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1370 %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
1371 %fneg = fneg half %min
1372 %mul = fmul half %fneg, %b
; fneg(minnum(0.0, a)) feeding an fmul by b, under attribute group #4
; (presumably IEEE mode disabled; attribute definition not visible here).
; The checks expect the min against 0 to be kept, with no extra per-input step,
; and the negation absorbed as a `-v0` source modifier on the multiply.
1376 define half @v_fneg_0_minnum_foldable_use_f16_no_ieee(half %a, half %b) #4 {
1377 ; SI-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
1379 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1380 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1381 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1382 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1383 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1384 ; SI-NEXT: v_min_f32_e32 v0, 0, v0
1385 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
1386 ; SI-NEXT: s_setpc_b64 s[30:31]
1388 ; VI-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
1390 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1391 ; VI-NEXT: v_min_f16_e32 v0, 0, v0
1392 ; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
1393 ; VI-NEXT: s_setpc_b64 s[30:31]
1395 ; GFX11-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
1397 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1398 ; GFX11-NEXT: v_min_f16_e32 v0, 0, v0
1399 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1400 ; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
1401 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1402 %min = call half @llvm.minnum.f16(half 0.0, half %a)
1403 %fneg = fneg half %min
1404 %mul = fmul half %fneg, %b
; minnum(a, b) with two uses: its fneg (element 0) and a multiply by 4.0
; (element 1). The checks expect only the negated form, max(-a, -b), to be
; computed; the second use is rewritten as that negated value times -4.0.
1408 define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b) #0 {
1409 ; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
1411 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1412 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1413 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1414 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
1415 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
1416 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
1417 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1418 ; SI-NEXT: v_max_f32_e32 v0, v0, v1
1419 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
1420 ; SI-NEXT: s_setpc_b64 s[30:31]
1422 ; VI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
1424 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1425 ; VI-NEXT: v_max_f16_e64 v1, -v1, -v1
1426 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
1427 ; VI-NEXT: v_max_f16_e32 v0, v0, v1
1428 ; VI-NEXT: v_mul_f16_e32 v1, -4.0, v0
1429 ; VI-NEXT: s_setpc_b64 s[30:31]
1431 ; GFX11-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
1433 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1434 ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
1435 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
1436 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1437 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
1438 ; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v0
1439 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1440 %min = call half @llvm.minnum.f16(half %a, half %b)
1441 %fneg = fneg half %min
1442 %use1 = fmul half %min, 4.0
1443 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
1444 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1
1445 ret { half, half } %insert.1
; minnum(a, b) with two uses: its fneg (lane 0) and a multiply by 4.0 (lane 1),
; returned packed in a <2 x half>, under attribute group #4 (presumably IEEE
; mode disabled; attribute definition not visible here). The targets differ:
; SI and VI compute max(-a, -b) once and multiply it by -4.0 (0xc400 on VI),
; while GFX11 keeps the plain min, multiplies by 4.0, and applies the negation
; as a source modifier on v_pack_b32_f16.
; The vector is built from `poison` rather than the deprecated `undef`
; placeholder, matching the other tests in this file; both lanes are
; overwritten, so the expected code is unaffected.
1448 define <2 x half> @v_fneg_minnum_multi_use_minnum_f16_no_ieee(half %a, half %b) #4 {
1449 ; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee:
1451 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1452 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1453 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1454 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
1455 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
1456 ; SI-NEXT: v_max_f32_e32 v0, v0, v1
1457 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
1458 ; SI-NEXT: s_setpc_b64 s[30:31]
1460 ; VI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee:
1462 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1463 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v1
1464 ; VI-NEXT: v_mov_b32_e32 v1, 0xc400
1465 ; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1466 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
1467 ; VI-NEXT: s_setpc_b64 s[30:31]
1469 ; GFX11-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee:
1471 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1472 ; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
1473 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1474 ; GFX11-NEXT: v_mul_f16_e32 v1, 4.0, v0
1475 ; GFX11-NEXT: v_pack_b32_f16 v0, -v0, v1
1476 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1477 %min = call half @llvm.minnum.f16(half %a, half %b)
1478 %fneg = fneg half %min
1479 %use1 = fmul half %min, 4.0
1480 %ins0 = insertelement <2 x half> poison, half %fneg, i32 0
1481 %ins1 = insertelement <2 x half> %ins0, half %use1, i32 1
1482 ret <2 x half> %ins1
1485 ; --------------------------------------------------------------------------------
1487 ; --------------------------------------------------------------------------------
; fneg(maxnum(a, b)) under attribute group #0: mirror of the minnum test. The
; checks expect the negation pushed into the operands, turning the max into a
; min of -a and -b. Each input first goes through its own negated step
; (negated self-max on VI/GFX11; negated convert plus multiply by 1.0 on SI).
1489 define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 {
1490 ; SI-LABEL: v_fneg_maxnum_f16_ieee:
1492 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1493 ; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
1494 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1495 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1496 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1497 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
1498 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1499 ; SI-NEXT: v_min_f32_e32 v0, v0, v1
1500 ; SI-NEXT: s_setpc_b64 s[30:31]
1502 ; VI-LABEL: v_fneg_maxnum_f16_ieee:
1504 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1505 ; VI-NEXT: v_max_f16_e64 v1, -v1, -v1
1506 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
1507 ; VI-NEXT: v_min_f16_e32 v0, v0, v1
1508 ; VI-NEXT: s_setpc_b64 s[30:31]
1510 ; GFX11-LABEL: v_fneg_maxnum_f16_ieee:
1512 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1513 ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
1514 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
1515 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1516 ; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
1517 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1518 %max = call half @llvm.maxnum.f16(half %a, half %b)
1519 %fneg = fneg half %max
; fneg(maxnum(a, b)) under attribute group #4 (presumably IEEE mode disabled;
; attribute definition not visible here). The checks expect a single min with
; neg modifiers on both operands on VI/GFX11, and negated conversions feeding
; one v_min_f32 on SI.
1523 define half @v_fneg_maxnum_f16_no_ieee(half %a, half %b) #4 {
1524 ; SI-LABEL: v_fneg_maxnum_f16_no_ieee:
1526 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1527 ; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
1528 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1529 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1530 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1531 ; SI-NEXT: v_min_f32_e32 v0, v0, v1
1532 ; SI-NEXT: s_setpc_b64 s[30:31]
1534 ; VI-LABEL: v_fneg_maxnum_f16_no_ieee:
1536 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1537 ; VI-NEXT: v_min_f16_e64 v0, -v0, -v1
1538 ; VI-NEXT: s_setpc_b64 s[30:31]
1540 ; GFX11-LABEL: v_fneg_maxnum_f16_no_ieee:
1542 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1543 ; GFX11-NEXT: v_min_f16_e64 v0, -v0, -v1
1544 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1545 %max = call half @llvm.maxnum.f16(half %a, half %b)
1546 %fneg = fneg half %max
; fneg(maxnum(a, a)): both maxnum operands are the same value, and the checks
; expect no min/max instruction at all, only a sign-bit flip (xor with
; 0x80000000 on SI, 0x8000 on VI/GFX11).
1550 define half @v_fneg_self_maxnum_f16_ieee(half %a) #0 {
1551 ; SI-LABEL: v_fneg_self_maxnum_f16_ieee:
1553 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1554 ; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1555 ; SI-NEXT: s_setpc_b64 s[30:31]
1557 ; VI-LABEL: v_fneg_self_maxnum_f16_ieee:
1559 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1560 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1561 ; VI-NEXT: s_setpc_b64 s[30:31]
1563 ; GFX11-LABEL: v_fneg_self_maxnum_f16_ieee:
1565 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1566 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1567 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1568 %max = call half @llvm.maxnum.f16(half %a, half %a)
1569 %max.fneg = fneg half %max
; fneg(maxnum(a, a)) under attribute group #4 (presumably IEEE mode disabled;
; attribute definition not visible here). As in the _ieee variant, the checks
; expect only a sign-bit xor and no min/max instruction.
1573 define half @v_fneg_self_maxnum_f16_no_ieee(half %a) #4 {
1574 ; SI-LABEL: v_fneg_self_maxnum_f16_no_ieee:
1576 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1577 ; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1578 ; SI-NEXT: s_setpc_b64 s[30:31]
1580 ; VI-LABEL: v_fneg_self_maxnum_f16_no_ieee:
1582 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1583 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1584 ; VI-NEXT: s_setpc_b64 s[30:31]
1586 ; GFX11-LABEL: v_fneg_self_maxnum_f16_no_ieee:
1588 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1589 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1590 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1591 %max = call half @llvm.maxnum.f16(half %a, half %a)
1592 %max.fneg = fneg half %max
; fneg(maxnum(4.0, a)): the checks expect the negation folded through the max,
; giving min(-4.0, -a). The variable input gets its own negated step first
; (negated self-max on VI/GFX11; negated convert plus multiply by 1.0 on SI).
1596 define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 {
1597 ; SI-LABEL: v_fneg_posk_maxnum_f16_ieee:
1599 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1600 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1601 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1602 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1603 ; SI-NEXT: v_min_f32_e32 v0, -4.0, v0
1604 ; SI-NEXT: s_setpc_b64 s[30:31]
1606 ; VI-LABEL: v_fneg_posk_maxnum_f16_ieee:
1608 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1609 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
1610 ; VI-NEXT: v_min_f16_e32 v0, -4.0, v0
1611 ; VI-NEXT: s_setpc_b64 s[30:31]
1613 ; GFX11-LABEL: v_fneg_posk_maxnum_f16_ieee:
1615 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1616 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
1617 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1618 ; GFX11-NEXT: v_min_f16_e32 v0, -4.0, v0
1619 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1620 %max = call half @llvm.maxnum.f16(half 4.0, half %a)
1621 %fneg = fneg half %max
; fneg(maxnum(4.0, a)) under attribute group #4 (presumably IEEE mode disabled;
; attribute definition not visible here). The checks expect a single
; min(-a, -4.0) on VI/GFX11, and a negated convert feeding v_min_f32 with -4.0
; on SI.
1625 define half @v_fneg_posk_maxnum_f16_no_ieee(half %a) #4 {
1626 ; SI-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
1628 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1629 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1630 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1631 ; SI-NEXT: v_min_f32_e32 v0, -4.0, v0
1632 ; SI-NEXT: s_setpc_b64 s[30:31]
1634 ; VI-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
1636 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1637 ; VI-NEXT: v_min_f16_e64 v0, -v0, -4.0
1638 ; VI-NEXT: s_setpc_b64 s[30:31]
1640 ; GFX11-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
1642 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1643 ; GFX11-NEXT: v_min_f16_e64 v0, -v0, -4.0
1644 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1645 %max = call half @llvm.maxnum.f16(half 4.0, half %a)
1646 %fneg = fneg half %max
; fneg(maxnum(-4.0, a)): the checks expect the negation folded through the max,
; giving min(4.0, -a). The variable input gets its own negated step first
; (negated self-max on VI/GFX11; negated convert plus multiply by 1.0 on SI).
1650 define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 {
1651 ; SI-LABEL: v_fneg_negk_maxnum_f16_ieee:
1653 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1654 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1655 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1656 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1657 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
1658 ; SI-NEXT: s_setpc_b64 s[30:31]
1660 ; VI-LABEL: v_fneg_negk_maxnum_f16_ieee:
1662 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1663 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
1664 ; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
1665 ; VI-NEXT: s_setpc_b64 s[30:31]
1667 ; GFX11-LABEL: v_fneg_negk_maxnum_f16_ieee:
1669 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1670 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
1671 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1672 ; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0
1673 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1674 %max = call half @llvm.maxnum.f16(half -4.0, half %a)
1675 %fneg = fneg half %max
; fneg(maxnum(-4.0, a)) under attribute group #4 (presumably IEEE mode
; disabled; attribute definition not visible here). The checks expect a single
; min(-a, 4.0) on VI/GFX11, and a negated convert feeding v_min_f32 with 4.0
; on SI.
1679 define half @v_fneg_negk_maxnum_f16_no_ieee(half %a) #4 {
1680 ; SI-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
1682 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1683 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1684 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1685 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
1686 ; SI-NEXT: s_setpc_b64 s[30:31]
1688 ; VI-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
1690 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1691 ; VI-NEXT: v_min_f16_e64 v0, -v0, 4.0
1692 ; VI-NEXT: s_setpc_b64 s[30:31]
1694 ; GFX11-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
1696 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1697 ; GFX11-NEXT: v_min_f16_e64 v0, -v0, 4.0
1698 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1699 %max = call half @llvm.maxnum.f16(half -4.0, half %a)
1700 %fneg = fneg half %max
; fneg(maxnum nnan (0.0, a)): here the checks expect the negation NOT to be
; folded into the max. The max against 0 is kept and followed by an explicit
; sign-bit xor on every target. NOTE(review): presumably because folding would
; need a -0.0 constant operand; confirm against the combine's cost logic.
1704 define half @v_fneg_0_maxnum_f16(half %a) #0 {
1705 ; SI-LABEL: v_fneg_0_maxnum_f16:
1707 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1708 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1709 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1710 ; SI-NEXT: v_max_f32_e32 v0, 0, v0
1711 ; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1712 ; SI-NEXT: s_setpc_b64 s[30:31]
1714 ; VI-LABEL: v_fneg_0_maxnum_f16:
1716 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1717 ; VI-NEXT: v_max_f16_e32 v0, 0, v0
1718 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1719 ; VI-NEXT: s_setpc_b64 s[30:31]
1721 ; GFX11-LABEL: v_fneg_0_maxnum_f16:
1723 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1724 ; GFX11-NEXT: v_max_f16_e32 v0, 0, v0
1725 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1726 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1727 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1728 %max = call nnan half @llvm.maxnum.f16(half 0.0, half %a)
1729 %fneg = fneg half %max
; fneg(maxnum(-0.0, a)): the checks expect the negation folded through the max,
; giving min(0, -a). The variable input gets its own negated step first
; (negated self-max on VI/GFX11; negated convert plus multiply by 1.0 on SI).
1733 define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 {
1734 ; SI-LABEL: v_fneg_neg0_maxnum_f16_ieee:
1736 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1737 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1738 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1739 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1740 ; SI-NEXT: v_min_f32_e32 v0, 0, v0
1741 ; SI-NEXT: s_setpc_b64 s[30:31]
1743 ; VI-LABEL: v_fneg_neg0_maxnum_f16_ieee:
1745 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1746 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
1747 ; VI-NEXT: v_min_f16_e32 v0, 0, v0
1748 ; VI-NEXT: s_setpc_b64 s[30:31]
1750 ; GFX11-LABEL: v_fneg_neg0_maxnum_f16_ieee:
1752 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1753 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
1754 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1755 ; GFX11-NEXT: v_min_f16_e32 v0, 0, v0
1756 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1757 %max = call half @llvm.maxnum.f16(half -0.0, half %a)
1758 %fneg = fneg half %max
; fneg(maxnum(-0.0, a)) compiled with attribute group #4.
; NOTE(review): #4 is defined outside this chunk; going by the test name it
; presumably disables IEEE mode -- confirm against the attribute list.
; The expected code is a single min of the negated input against 0 on VI and
; GFX11 (negated convert followed by the min on SI), with no separate
; canonicalizing instruction.
1762 define half @v_fneg_neg0_maxnum_f16_no_ieee(half %a) #4 {
1763 ; SI-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
1765 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1766 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
1767 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1768 ; SI-NEXT: v_min_f32_e32 v0, 0, v0
1769 ; SI-NEXT: s_setpc_b64 s[30:31]
1771 ; VI-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
1773 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1774 ; VI-NEXT: v_min_f16_e64 v0, -v0, 0
1775 ; VI-NEXT: s_setpc_b64 s[30:31]
1777 ; GFX11-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
1779 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1780 ; GFX11-NEXT: v_min_f16_e64 v0, -v0, 0
1781 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1782 %max = call half @llvm.maxnum.f16(half -0.0, half %a)
1783 %fneg = fneg half %max
; fneg(maxnum(+0.0, a)) * b -- the fneg feeds an fmul that can take it as a
; source modifier.
; The expected code keeps the max against 0 unchanged and expresses the
; negation as the -v0 operand of the multiply; no separate xor is emitted.
; The v_mul 1.0 (SI) / v_max v0, v0 (VI, GFX11) before the max is presumably
; the IEEE-mode input canonicalization; it is absent in the _no_ieee variant.
1787 define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 {
1788 ; SI-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
1790 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1791 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1792 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1793 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1794 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1795 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1796 ; SI-NEXT: v_max_f32_e32 v0, 0, v0
1797 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
1798 ; SI-NEXT: s_setpc_b64 s[30:31]
1800 ; VI-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
1802 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1803 ; VI-NEXT: v_max_f16_e32 v0, v0, v0
1804 ; VI-NEXT: v_max_f16_e32 v0, 0, v0
1805 ; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
1806 ; VI-NEXT: s_setpc_b64 s[30:31]
1808 ; GFX11-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
1810 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1811 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
1812 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1813 ; GFX11-NEXT: v_max_f16_e32 v0, 0, v0
1814 ; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
1815 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1816 %max = call half @llvm.maxnum.f16(half 0.0, half %a)
1817 %fneg = fneg half %max
1818 %mul = fmul half %fneg, %b
; fneg(maxnum(+0.0, a)) * b, compiled with attribute group #4.
; NOTE(review): #4 is defined outside this chunk; going by the test name it
; presumably disables IEEE mode -- confirm against the attribute list.
; The expected code is the max against 0 followed by a multiply that consumes
; the negation as a -v0 source modifier, with no canonicalizing instruction
; before the max.
1822 define half @v_fneg_0_maxnum_foldable_use_f16_no_ieee(half %a, half %b) #4 {
1823 ; SI-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
1825 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1826 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1827 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1828 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1829 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1830 ; SI-NEXT: v_max_f32_e32 v0, 0, v0
1831 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
1832 ; SI-NEXT: s_setpc_b64 s[30:31]
1834 ; VI-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
1836 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1837 ; VI-NEXT: v_max_f16_e32 v0, 0, v0
1838 ; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
1839 ; VI-NEXT: s_setpc_b64 s[30:31]
1841 ; GFX11-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
1843 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1844 ; GFX11-NEXT: v_max_f16_e32 v0, 0, v0
1845 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1846 ; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
1847 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1848 %max = call half @llvm.maxnum.f16(half 0.0, half %a)
1849 %fneg = fneg half %max
1850 %mul = fmul half %fneg, %b
; maxnum(a, b) with two users: an fneg and an fmul by 4.0, both returned in a
; { half, half } struct.
; The expected code computes the negated value directly as a min of the negated
; inputs and rewrites the second use as a multiply by -4.0 of that result, so
; the original max is never materialized.
; The v_mul 1.0 (SI) / v_max -vN, -vN (VI, GFX11) instructions are presumably
; the IEEE-mode input canonicalization; the _no_ieee variant lacks them.
1854 define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b) #0 {
1855 ; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
1857 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1858 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1859 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1860 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
1861 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
1862 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
1863 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
1864 ; SI-NEXT: v_min_f32_e32 v0, v0, v1
1865 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
1866 ; SI-NEXT: s_setpc_b64 s[30:31]
1868 ; VI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
1870 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1871 ; VI-NEXT: v_max_f16_e64 v1, -v1, -v1
1872 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
1873 ; VI-NEXT: v_min_f16_e32 v0, v0, v1
1874 ; VI-NEXT: v_mul_f16_e32 v1, -4.0, v0
1875 ; VI-NEXT: s_setpc_b64 s[30:31]
1877 ; GFX11-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
1879 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1880 ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
1881 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
1882 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1883 ; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
1884 ; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v0
1885 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1886 %max = call half @llvm.maxnum.f16(half %a, half %b)
1887 %fneg = fneg half %max
1888 %use1 = fmul half %max, 4.0
1889 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
1890 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1
1891 ret { half, half } %insert.1
; maxnum(a, b) with two users (an fneg and an fmul by 4.0), compiled with
; attribute group #4 and returned packed in a <2 x half>.
; NOTE(review): #4 is defined outside this chunk; going by the test name it
; presumably disables IEEE mode -- confirm against the attribute list.
; On SI and VI the expected code computes a min of the negated inputs and
; multiplies that by -4.0 (0xc400 is the f16 encoding of -4.0) for the second
; lane. On GFX11 the max is kept, multiplied by 4.0, and the negation is applied
; as a source modifier of v_pack_b32_f16.
; The vector is seeded with poison rather than undef, matching the other
; aggregate-building tests in this file; both lanes are overwritten, so the
; seed value never reaches the generated code.
1894 define <2 x half> @v_fneg_maxnum_multi_use_maxnum_f16_no_ieee(half %a, half %b) #4 {
1895 ; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee:
1897 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1898 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1899 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1900 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
1901 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
1902 ; SI-NEXT: v_min_f32_e32 v0, v0, v1
1903 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
1904 ; SI-NEXT: s_setpc_b64 s[30:31]
1906 ; VI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee:
1908 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1909 ; VI-NEXT: v_min_f16_e64 v0, -v0, -v1
1910 ; VI-NEXT: v_mov_b32_e32 v1, 0xc400
1911 ; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1912 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
1913 ; VI-NEXT: s_setpc_b64 s[30:31]
1915 ; GFX11-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee:
1917 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1918 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
1919 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1920 ; GFX11-NEXT: v_mul_f16_e32 v1, 4.0, v0
1921 ; GFX11-NEXT: v_pack_b32_f16 v0, -v0, v1
1922 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1923 %max = call half @llvm.maxnum.f16(half %a, half %b)
1924 %fneg = fneg half %max
1925 %use1 = fmul half %max, 4.0
1926 %ins0 = insertelement <2 x half> poison, half %fneg, i32 0
1927 %ins1 = insertelement <2 x half> %ins0, half %use1, i32 1
1928 ret <2 x half> %ins1
1931 ; --------------------------------------------------------------------------------
1933 ; --------------------------------------------------------------------------------
; fneg(fma(a, b, c)).
; In the -SAFE runs the expected code is the plain fma (v_fmac accumulating into
; v2 on GFX11) followed by a sign-bit xor. In the -NSZ runs (llc invoked with
; -enable-no-signed-zeros-fp-math) the negation is folded into the operands,
; giving fma(a, -b, -c) with no xor.
1935 define half @v_fneg_fma_f16(half %a, half %b, half %c) #0 {
1936 ; SI-SAFE-LABEL: v_fneg_fma_f16:
1938 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1939 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
1940 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
1941 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
1942 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
1943 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
1944 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
1945 ; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2
1946 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
1947 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
1949 ; SI-NSZ-LABEL: v_fneg_fma_f16:
1951 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1952 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
1953 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
1954 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
1955 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
1956 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
1957 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
1958 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2
1959 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
1961 ; VI-SAFE-LABEL: v_fneg_fma_f16:
1963 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1964 ; VI-SAFE-NEXT: v_fma_f16 v0, v0, v1, v2
1965 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1966 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
1968 ; VI-NSZ-LABEL: v_fneg_fma_f16:
1970 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1971 ; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
1972 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
1974 ; GFX11-SAFE-LABEL: v_fneg_fma_f16:
1975 ; GFX11-SAFE: ; %bb.0:
1976 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1977 ; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1
1978 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
1979 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2
1980 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
1982 ; GFX11-NSZ-LABEL: v_fneg_fma_f16:
1983 ; GFX11-NSZ: ; %bb.0:
1984 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1985 ; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
1986 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
1987 %fma = call half @llvm.fma.f16(half %a, half %b, half %c)
1988 %fneg = fneg half %fma
; fma(a, b, c) is returned both negated (element 0) and unmodified (element 1).
; The checks use the common SI / VI / GFX11 prefixes, so the same code is
; expected with and without no-signed-zeros: the fma result is kept in v1 and
; the negated copy is produced in v0 by a sign-bit xor.
1992 define { half, half } @v_fneg_fma_store_use_fma_f16(half %a, half %b, half %c) #0 {
1993 ; SI-LABEL: v_fneg_fma_store_use_fma_f16:
1995 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1996 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
1997 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1998 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1999 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
2000 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
2001 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
2002 ; SI-NEXT: v_fma_f32 v1, v0, v1, v2
2003 ; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
2004 ; SI-NEXT: s_setpc_b64 s[30:31]
2006 ; VI-LABEL: v_fneg_fma_store_use_fma_f16:
2008 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2009 ; VI-NEXT: v_fma_f16 v1, v0, v1, v2
2010 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v1
2011 ; VI-NEXT: s_setpc_b64 s[30:31]
2013 ; GFX11-LABEL: v_fneg_fma_store_use_fma_f16:
2015 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2016 ; GFX11-NEXT: v_fma_f16 v1, v0, v1, v2
2017 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2018 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1
2019 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2020 %fma = call half @llvm.fma.f16(half %a, half %b, half %c)
2021 %fneg = fneg half %fma
2022 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
2023 %insert.1 = insertvalue { half, half } %insert.0, half %fma, 1
2024 ret { half, half } %insert.1
; fma(a, b, c) with two users: an fneg and an fmul by 4.0, both returned.
; In the -SAFE runs the expected code is fma, sign-bit xor for the negated copy,
; and a multiply of the original result by 4.0. In the -NSZ runs the fma is
; emitted already negated as fma(a, -b, -c) and the second use becomes a
; multiply of that value by -4.0.
2027 define { half, half } @v_fneg_fma_multi_use_fma_f16(half %a, half %b, half %c) #0 {
2028 ; SI-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16:
2030 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2031 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
2032 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
2033 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
2034 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
2035 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
2036 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
2037 ; SI-SAFE-NEXT: v_fma_f32 v1, v0, v1, v2
2038 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
2039 ; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1
2040 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
2042 ; SI-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16:
2044 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2045 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
2046 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
2047 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
2048 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
2049 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
2050 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
2051 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2
2052 ; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0
2053 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
2055 ; VI-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16:
2057 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2058 ; VI-SAFE-NEXT: v_fma_f16 v1, v0, v1, v2
2059 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v1
2060 ; VI-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v1
2061 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
2063 ; VI-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16:
2065 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2066 ; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
2067 ; VI-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0
2068 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
2070 ; GFX11-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16:
2071 ; GFX11-SAFE: ; %bb.0:
2072 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2073 ; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1
2074 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
2075 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2
2076 ; GFX11-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v2
2077 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
2079 ; GFX11-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16:
2080 ; GFX11-NSZ: ; %bb.0:
2081 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2082 ; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
2083 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
2084 ; GFX11-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0
2085 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
2086 %fma = call half @llvm.fma.f16(half %a, half %b, half %c)
2087 %fneg = fneg half %fma
2088 %use1 = fmul half %fma, 4.0
2089 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
2090 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1
2091 ret { half, half } %insert.1
; fneg(fma(-a, b, c)).
; In the -SAFE runs the expected code is fma(-a, b, c) with the inner negation
; as a source modifier, followed by a sign-bit xor. In the -NSZ runs the outer
; negation cancels the one on a and moves onto c, giving fma(a, b, -c).
2094 define half @v_fneg_fma_fneg_x_y_f16(half %a, half %b, half %c) #0 {
2095 ; SI-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16:
2097 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2098 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
2099 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
2100 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
2101 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
2102 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
2103 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
2104 ; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v1, v2
2105 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
2106 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
2108 ; SI-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16:
2110 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2111 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
2112 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
2113 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
2114 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
2115 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
2116 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
2117 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2
2118 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
2120 ; VI-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16:
2122 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2123 ; VI-SAFE-NEXT: v_fma_f16 v0, -v0, v1, v2
2124 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2125 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
2127 ; VI-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16:
2129 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2130 ; VI-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2
2131 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
2133 ; GFX11-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16:
2134 ; GFX11-SAFE: ; %bb.0:
2135 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2136 ; GFX11-SAFE-NEXT: v_fma_f16 v0, -v0, v1, v2
2137 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
2138 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2139 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
2141 ; GFX11-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16:
2142 ; GFX11-NSZ: ; %bb.0:
2143 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2144 ; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2
2145 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
2146 %fneg.a = fneg half %a
2147 %fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %c)
2148 %fneg = fneg half %fma
; fneg(fma(a, -b, c)).
; In the -SAFE runs the expected code is fma(a, -b, c) with the inner negation
; as a source modifier, followed by a sign-bit xor. In the -NSZ runs the outer
; negation cancels the one on b and moves onto c, giving fma(a, b, -c).
2152 define half @v_fneg_fma_x_fneg_y_f16(half %a, half %b, half %c) #0 {
2153 ; SI-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16:
2155 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2156 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
2157 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
2158 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
2159 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
2160 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
2161 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
2162 ; SI-SAFE-NEXT: v_fma_f32 v0, v0, -v1, v2
2163 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
2164 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
2166 ; SI-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16:
2168 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2169 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
2170 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
2171 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
2172 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
2173 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
2174 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
2175 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2
2176 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
2178 ; VI-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16:
2180 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2181 ; VI-SAFE-NEXT: v_fma_f16 v0, v0, -v1, v2
2182 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2183 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
2185 ; VI-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16:
2187 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2188 ; VI-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2
2189 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
2191 ; GFX11-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16:
2192 ; GFX11-SAFE: ; %bb.0:
2193 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2194 ; GFX11-SAFE-NEXT: v_fma_f16 v0, v0, -v1, v2
2195 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
2196 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2197 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
2199 ; GFX11-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16:
2200 ; GFX11-NSZ: ; %bb.0:
2201 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2202 ; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2
2203 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
2204 %fneg.b = fneg half %b
2205 %fma = call half @llvm.fma.f16(half %a, half %fneg.b, half %c)
2206 %fneg = fneg half %fma
; fneg(fma(-a, -b, c)).
; In the -SAFE runs the two multiplicand negations cancel, so the expected code
; is a plain fma (v_fmac into v2 on GFX11) followed by a sign-bit xor. In the
; -NSZ runs the outer negation is folded into the operands as fma(a, -b, -c).
2210 define half @v_fneg_fma_fneg_fneg_y_f16(half %a, half %b, half %c) #0 {
2211 ; SI-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16:
2213 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2214 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
2215 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
2216 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
2217 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
2218 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
2219 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
2220 ; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2
2221 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
2222 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
2224 ; SI-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16:
2226 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2227 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
2228 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
2229 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
2230 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
2231 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
2232 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
2233 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2
2234 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
2236 ; VI-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16:
2238 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2239 ; VI-SAFE-NEXT: v_fma_f16 v0, v0, v1, v2
2240 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2241 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
2243 ; VI-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16:
2245 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2246 ; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
2247 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
2249 ; GFX11-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16:
2250 ; GFX11-SAFE: ; %bb.0:
2251 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2252 ; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1
2253 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
2254 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2
2255 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
2257 ; GFX11-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16:
2258 ; GFX11-NSZ: ; %bb.0:
2259 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2260 ; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
2261 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
2262 %fneg.a = fneg half %a
2263 %fneg.b = fneg half %b
2264 %fma = call half @llvm.fma.f16(half %fneg.a, half %fneg.b, half %c)
2265 %fneg = fneg half %fma
; fneg(fma(-a, b, -c)).
; In the -SAFE runs the expected code is fma(-a, b, -c) with both inner
; negations as source modifiers, followed by a sign-bit xor. In the -NSZ runs
; the outer negation cancels both inner ones, leaving a plain fma(a, b, c).
2269 define half @v_fneg_fma_fneg_x_fneg_f16(half %a, half %b, half %c) #0 {
2270 ; SI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16:
2272 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2273 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
2274 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
2275 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
2276 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
2277 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
2278 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
2279 ; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v1, -v2
2280 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
2281 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
2283 ; SI-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16:
2285 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2286 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
2287 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
2288 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
2289 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
2290 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
2291 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
2292 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, v2
2293 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
2295 ; VI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16:
2297 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2298 ; VI-SAFE-NEXT: v_fma_f16 v0, -v0, v1, -v2
2299 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2300 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
2302 ; VI-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16:
2304 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2305 ; VI-NSZ-NEXT: v_fma_f16 v0, v0, v1, v2
2306 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
2308 ; GFX11-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16:
2309 ; GFX11-SAFE: ; %bb.0:
2310 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2311 ; GFX11-SAFE-NEXT: v_fma_f16 v0, -v0, v1, -v2
2312 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
2313 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2314 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
2316 ; GFX11-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16:
2317 ; GFX11-NSZ: ; %bb.0:
2318 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2319 ; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, v1, v2
2320 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
2321 %fneg.a = fneg half %a
2322 %fneg.c = fneg half %c
2323 %fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %fneg.c)
2324 %fneg = fneg half %fma
; fneg(fma(a, b, -c)).
; In the -SAFE runs the expected code is fma(a, b, -c) with the inner negation
; as a source modifier, followed by a sign-bit xor. In the -NSZ runs the outer
; negation cancels the one on c and moves onto b, giving fma(a, -b, c).
2328 define half @v_fneg_fma_x_y_fneg_f16(half %a, half %b, half %c) #0 {
2329 ; SI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16:
2331 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2332 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
2333 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
2334 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
2335 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
2336 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
2337 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
2338 ; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, -v2
2339 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
2340 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
2342 ; SI-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16:
2344 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2345 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
2346 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
2347 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
2348 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
2349 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
2350 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
2351 ; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, v2
2352 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
2354 ; VI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16:
2356 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2357 ; VI-SAFE-NEXT: v_fma_f16 v0, v0, v1, -v2
2358 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2359 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
2361 ; VI-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16:
2363 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2364 ; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, v2
2365 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
2367 ; GFX11-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16:
2368 ; GFX11-SAFE: ; %bb.0:
2369 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2370 ; GFX11-SAFE-NEXT: v_fma_f16 v0, v0, v1, -v2
2371 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
2372 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2373 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
2375 ; GFX11-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16:
2376 ; GFX11-NSZ: ; %bb.0:
2377 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2378 ; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, v2
2379 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
2380 %fneg.c = fneg half %c
2381 %fma = call half @llvm.fma.f16(half %a, half %b, half %fneg.c)
2382 %fneg = fneg half %fma
; fneg(fma(-a, b, c)) where the inner -a is also returned directly as the
; second struct element.
; In the -SAFE runs the expected code is fma with a negated first operand plus a
; sign-bit xor on the result; in the -NSZ runs it is fma(a, b, -c) with no xor
; on the result. In both cases the returned -a is materialized separately (a
; sign-bit xor of the input, or a negated convert on SI-NSZ).
2386 define { half, half } @v_fneg_fma_store_use_fneg_x_y_f16(half %a, half %b, half %c) #0 {
2387 ; SI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
2389 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2390 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
2391 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
2392 ; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v3, -v0
2393 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
2394 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v1
2395 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3
2396 ; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
2397 ; SI-SAFE-NEXT: v_fma_f32 v0, v3, v4, v2
2398 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
2399 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
2401 ; SI-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
2403 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2404 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
2405 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
2406 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
2407 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
2408 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1
2409 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0
2410 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0
2411 ; SI-NSZ-NEXT: v_fma_f32 v0, v4, v3, -v2
2412 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
2414 ; VI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
2416 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2417 ; VI-SAFE-NEXT: v_xor_b32_e32 v3, 0x8000, v0
2418 ; VI-SAFE-NEXT: v_fma_f16 v0, -v0, v1, v2
2419 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2420 ; VI-SAFE-NEXT: v_mov_b32_e32 v1, v3
2421 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
2423 ; VI-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
2425 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2426 ; VI-NSZ-NEXT: v_xor_b32_e32 v3, 0x8000, v0
2427 ; VI-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2
2428 ; VI-NSZ-NEXT: v_mov_b32_e32 v1, v3
2429 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
2431 ; GFX11-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
2432 ; GFX11-SAFE: ; %bb.0:
2433 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2434 ; GFX11-SAFE-NEXT: v_fma_f16 v1, -v0, v1, v2
2435 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2436 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v1
2437 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
2438 ; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, v2
2439 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
2441 ; GFX11-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
2442 ; GFX11-NSZ: ; %bb.0:
2443 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2444 ; GFX11-NSZ-NEXT: v_fma_f16 v2, v0, v1, -v2
2445 ; GFX11-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0
2446 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2)
2447 ; GFX11-NSZ-NEXT: v_mov_b32_e32 v0, v2
2448 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
2449 %fneg.a = fneg half %a
2450 %fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %c)
2451 %fneg = fneg half %fma
2452 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
2453 %insert.1 = insertvalue { half, half } %insert.0, half %fneg.a, 1
2454 ret { half, half } %insert.1
; fneg(fma(-a, b, c)) where the inner -a has a second user, an fmul by d.
; In the -SAFE runs the expected code is fma with a negated first operand plus a
; sign-bit xor on the result; in the -NSZ runs it is fma(a, b, -c) with no xor.
; On VI and GFX11 the extra multiply takes -a as a source modifier (-v0); on SI
; it multiplies by a separately converted negated copy of a.
2457 define { half, half } @v_fneg_fma_multi_use_fneg_x_y_f16(half %a, half %b, half %c, half %d) #0 {
2458 ; SI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
2460 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2461 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
2462 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
2463 ; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
2464 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3
2465 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
2466 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
2467 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0
2468 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3
2469 ; SI-SAFE-NEXT: v_fma_f32 v0, v4, v1, v2
2470 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
2471 ; SI-SAFE-NEXT: v_mul_f32_e32 v1, v4, v3
2472 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
2474 ; SI-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
2476 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2477 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3
2478 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
2479 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
2480 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
2481 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3
2482 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
2483 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
2484 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0
2485 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v5, -v0
2486 ; SI-NSZ-NEXT: v_fma_f32 v0, v4, v1, -v2
2487 ; SI-NSZ-NEXT: v_mul_f32_e32 v1, v5, v3
2488 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
2490 ; VI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
2492 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2493 ; VI-SAFE-NEXT: v_fma_f16 v1, -v0, v1, v2
2494 ; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v1
2495 ; VI-SAFE-NEXT: v_mul_f16_e64 v1, -v0, v3
2496 ; VI-SAFE-NEXT: v_mov_b32_e32 v0, v2
2497 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
2499 ; VI-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
2501 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2502 ; VI-NSZ-NEXT: v_fma_f16 v2, v0, v1, -v2
2503 ; VI-NSZ-NEXT: v_mul_f16_e64 v1, -v0, v3
2504 ; VI-NSZ-NEXT: v_mov_b32_e32 v0, v2
2505 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
2507 ; GFX11-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
2508 ; GFX11-SAFE: ; %bb.0:
2509 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2510 ; GFX11-SAFE-NEXT: v_fma_f16 v1, -v0, v1, v2
2511 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2512 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v1
2513 ; GFX11-SAFE-NEXT: v_mul_f16_e64 v1, -v0, v3
2514 ; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, v2
2515 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
2517 ; GFX11-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
2518 ; GFX11-NSZ: ; %bb.0:
2519 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2520 ; GFX11-NSZ-NEXT: v_fma_f16 v2, v0, v1, -v2
2521 ; GFX11-NSZ-NEXT: v_mul_f16_e64 v1, -v0, v3
2522 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2)
2523 ; GFX11-NSZ-NEXT: v_mov_b32_e32 v0, v2
2524 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
2525 %fneg.a = fneg half %a
2526 %fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %c)
2527 %fneg = fneg half %fma
2528 %use1 = fmul half %fneg.a, %d
2529 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
2530 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1
2531 ret { half, half } %insert.1
2534 ; --------------------------------------------------------------------------------
2536 ; --------------------------------------------------------------------------------
; fneg(fmuladd(a, b, c)).
; On SI the -SAFE run expects v_mac_f32 into v2 followed by a sign-bit xor, and
; the -NSZ run expects v_mad_f32 with b negated at its f32->f16 convert and c
; negated as a source modifier. On VI and GFX11 the fmuladd is emitted as an
; fma: plain fma/fmac plus xor in the -SAFE runs, fma(a, -b, -c) in the -NSZ
; runs.
2538 define half @v_fneg_fmad_f16(half %a, half %b, half %c) #0 {
2539 ; SI-SAFE-LABEL: v_fneg_fmad_f16:
2541 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2542 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
2543 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
2544 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
2545 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
2546 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
2547 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
2548 ; SI-SAFE-NEXT: v_mac_f32_e32 v2, v0, v1
2549 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v2
2550 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
2552 ; SI-NSZ-LABEL: v_fneg_fmad_f16:
2554 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2555 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
2556 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
2557 ; SI-NSZ-NEXT: v_cvt_f16_f32_e64 v1, -v1
2558 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
2559 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
2560 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
2561 ; SI-NSZ-NEXT: v_mad_f32 v0, v0, v1, -v2
2562 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
2564 ; VI-SAFE-LABEL: v_fneg_fmad_f16:
2566 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2567 ; VI-SAFE-NEXT: v_fma_f16 v0, v0, v1, v2
2568 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2569 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
2571 ; VI-NSZ-LABEL: v_fneg_fmad_f16:
2573 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2574 ; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
2575 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
2577 ; GFX11-SAFE-LABEL: v_fneg_fmad_f16:
2578 ; GFX11-SAFE: ; %bb.0:
2579 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2580 ; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1
2581 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
2582 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2
2583 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
2585 ; GFX11-NSZ-LABEL: v_fneg_fmad_f16:
2586 ; GFX11-NSZ: ; %bb.0:
2587 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2588 ; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
2589 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
2590 %fma = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
2591 %fneg = fneg half %fma
; Vector form of the fneg(fmuladd) test: fneg applied to the result of
; llvm.fmuladd.v4f16 on <4 x half> operands.
; Without no-signed-zeros the checks expect the multiply-adds followed by
; v_xor of the sign bits (0x80000000 per f32 lane on hawaii, 0x80008000 per
; packed pair on fiji and gfx1100). With no-signed-zeros the checks expect no
; v_xor; the negation shows up as source modifiers (or neg_lo/neg_hi on the
; packed gfx1100 fma).
; NOTE(review): the function name says v4f32 but the types are <4 x half>;
; presumably a leftover from an f32 variant of this test. Renaming would
; require regenerating every LABEL check, so it is left as is.
2595 define <4 x half> @v_fneg_fmad_v4f32(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
2596 ; SI-SAFE-LABEL: v_fneg_fmad_v4f32:
2598 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2599 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8
2600 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4
2601 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9
2602 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
2603 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5
2604 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10
2605 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6
2606 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
2607 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11
2608 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7
2609 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3
2610 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
2611 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8
2612 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4
2613 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9
2614 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5
2615 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10
2616 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6
2617 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11
2618 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7
2619 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3
2620 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
2621 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
2622 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
2623 ; SI-SAFE-NEXT: v_mac_f32_e32 v11, v3, v7
2624 ; SI-SAFE-NEXT: v_mac_f32_e32 v10, v2, v6
2625 ; SI-SAFE-NEXT: v_mac_f32_e32 v9, v1, v5
2626 ; SI-SAFE-NEXT: v_mac_f32_e32 v8, v0, v4
2627 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v8
2628 ; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v9
2629 ; SI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80000000, v10
2630 ; SI-SAFE-NEXT: v_xor_b32_e32 v3, 0x80000000, v11
2631 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
2633 ; SI-NSZ-LABEL: v_fneg_fmad_v4f32:
2635 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2636 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v11, v11
2637 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3
2638 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v10, v10
2639 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v7, v7
2640 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
2641 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v9, v9
2642 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
2643 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v6, v6
2644 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v8, v8
2645 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
2646 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v4, v4
2647 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v5, v5
2648 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v11, v11
2649 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3
2650 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v10, v10
2651 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
2652 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v9, v9
2653 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
2654 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v8, v8
2655 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
2656 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v4, -v4
2657 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v5, -v5
2658 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v6, -v6
2659 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v7, -v7
2660 ; SI-NSZ-NEXT: v_mad_f32 v0, v0, v4, -v8
2661 ; SI-NSZ-NEXT: v_mad_f32 v1, v1, v5, -v9
2662 ; SI-NSZ-NEXT: v_mad_f32 v2, v2, v6, -v10
2663 ; SI-NSZ-NEXT: v_mad_f32 v3, v3, v7, -v11
2664 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
2666 ; VI-SAFE-LABEL: v_fneg_fmad_v4f32:
2668 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2669 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v5
2670 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v3
2671 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v1
2672 ; VI-SAFE-NEXT: v_fma_f16 v6, v8, v7, v6
2673 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v4
2674 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v2
2675 ; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0
2676 ; VI-SAFE-NEXT: v_fma_f16 v7, v9, v8, v7
2677 ; VI-SAFE-NEXT: v_fma_f16 v0, v0, v2, v4
2678 ; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v7
2679 ; VI-SAFE-NEXT: v_fma_f16 v1, v1, v3, v5
2680 ; VI-SAFE-NEXT: v_or_b32_e32 v0, v0, v2
2681 ; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v6
2682 ; VI-SAFE-NEXT: v_or_b32_e32 v1, v1, v2
2683 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
2684 ; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
2685 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
2687 ; VI-NSZ-LABEL: v_fneg_fmad_v4f32:
2689 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2690 ; VI-NSZ-NEXT: v_lshrrev_b32_e32 v6, 16, v5
2691 ; VI-NSZ-NEXT: v_lshrrev_b32_e32 v7, 16, v3
2692 ; VI-NSZ-NEXT: v_lshrrev_b32_e32 v8, 16, v1
2693 ; VI-NSZ-NEXT: v_fma_f16 v6, v8, -v7, -v6
2694 ; VI-NSZ-NEXT: v_lshrrev_b32_e32 v7, 16, v4
2695 ; VI-NSZ-NEXT: v_lshrrev_b32_e32 v8, 16, v2
2696 ; VI-NSZ-NEXT: v_lshrrev_b32_e32 v9, 16, v0
2697 ; VI-NSZ-NEXT: v_fma_f16 v7, v9, -v8, -v7
2698 ; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v2, -v4
2699 ; VI-NSZ-NEXT: v_lshlrev_b32_e32 v2, 16, v7
2700 ; VI-NSZ-NEXT: v_fma_f16 v1, v1, -v3, -v5
2701 ; VI-NSZ-NEXT: v_or_b32_e32 v0, v0, v2
2702 ; VI-NSZ-NEXT: v_lshlrev_b32_e32 v2, 16, v6
2703 ; VI-NSZ-NEXT: v_or_b32_e32 v1, v1, v2
2704 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
2706 ; GFX11-SAFE-LABEL: v_fneg_fmad_v4f32:
2707 ; GFX11-SAFE: ; %bb.0:
2708 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2709 ; GFX11-SAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
2710 ; GFX11-SAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5
2711 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2712 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
2713 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
2714 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
2716 ; GFX11-NSZ-LABEL: v_fneg_fmad_v4f32:
2717 ; GFX11-NSZ: ; %bb.0:
2718 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2719 ; GFX11-NSZ-NEXT: v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,1,1] neg_hi:[0,1,1]
2720 ; GFX11-NSZ-NEXT: v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,1,1] neg_hi:[0,1,1]
2721 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
2722 %fma = call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
2723 %fneg = fneg <4 x half> %fma
2724 ret <4 x half> %fneg
; fneg(fmuladd(a, b, c)) where the un-negated fmuladd result has a second
; user (fmul by 4.0); both values are returned in a { half, half } struct.
; Without no-signed-zeros the checks expect the multiply-add, a v_xor for
; the negated copy, and a multiply of the original result by 4.0.
; With no-signed-zeros the checks expect only the negated multiply-add to be
; computed and the second use to become a multiply of that value by -4.0.
2727 define { half, half } @v_fneg_fmad_multi_use_fmad_f16(half %a, half %b, half %c) #0 {
2728 ; SI-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16:
2730 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2731 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
2732 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
2733 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
2734 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
2735 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
2736 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
2737 ; SI-SAFE-NEXT: v_mac_f32_e32 v2, v0, v1
2738 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v2
2739 ; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v2
2740 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
2742 ; SI-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16:
2744 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2745 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
2746 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
2747 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
2748 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
2749 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
2750 ; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v1
2751 ; SI-NSZ-NEXT: v_mad_f32 v0, v0, v1, -v2
2752 ; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0
2753 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
2755 ; VI-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16:
2757 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2758 ; VI-SAFE-NEXT: v_fma_f16 v1, v0, v1, v2
2759 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v1
2760 ; VI-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v1
2761 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
2763 ; VI-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16:
2765 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2766 ; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
2767 ; VI-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0
2768 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
2770 ; GFX11-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16:
2771 ; GFX11-SAFE: ; %bb.0:
2772 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2773 ; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1
2774 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
2775 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2
2776 ; GFX11-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v2
2777 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
2779 ; GFX11-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16:
2780 ; GFX11-NSZ: ; %bb.0:
2781 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2782 ; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
2783 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
2784 ; GFX11-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0
2785 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
2786 %fma = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
2787 %fneg = fneg half %fma
; Second user of the un-negated %fma.
2788 %use1 = fmul half %fma, 4.0
2789 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
2790 %insert.1 = insertvalue { half, half } %insert.0, half %use1, 1
2791 ret { half, half } %insert.1
2794 ; --------------------------------------------------------------------------------
2796 ; --------------------------------------------------------------------------------
; fneg of (fpext half -> double).
; The hawaii checks expect the negation as a source modifier on the
; f32 -> f64 conversion. The fiji and gfx1100 checks expect the sign bit of
; the half input to be flipped (v_xor with 0x8000) before the two
; conversions, i.e. the negation is applied to the source of the extension.
2798 define double @v_fneg_fp_extend_f16_to_f64(half %a) #0 {
2799 ; SI-LABEL: v_fneg_fp_extend_f16_to_f64:
2801 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2802 ; SI-NEXT: v_cvt_f64_f32_e64 v[0:1], -v0
2803 ; SI-NEXT: s_setpc_b64 s[30:31]
2805 ; VI-LABEL: v_fneg_fp_extend_f16_to_f64:
2807 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2808 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2809 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
2810 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2811 ; VI-NEXT: s_setpc_b64 s[30:31]
2813 ; GFX11-LABEL: v_fneg_fp_extend_f16_to_f64:
2815 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2816 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
2817 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2818 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
2819 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2820 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2821 %fpext = fpext half %a to double
2822 %fneg = fneg double %fpext
; fneg of (fpext (fneg half) -> double): the input is negated before the
; extension and the result is negated again.
; The checks on every target expect plain conversions with no v_xor and no
; negate source modifier, i.e. the two negations cancel.
2826 define double @v_fneg_fp_extend_fneg_f16_to_f64(half %a) #0 {
2827 ; SI-LABEL: v_fneg_fp_extend_fneg_f16_to_f64:
2829 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2830 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2831 ; SI-NEXT: s_setpc_b64 s[30:31]
2833 ; VI-LABEL: v_fneg_fp_extend_fneg_f16_to_f64:
2835 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2836 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
2837 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2838 ; VI-NEXT: s_setpc_b64 s[30:31]
2840 ; GFX11-LABEL: v_fneg_fp_extend_fneg_f16_to_f64:
2842 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2843 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
2844 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2845 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2846 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2847 %fneg.a = fneg half %a
2848 %fpext = fpext half %fneg.a to double
2849 %fneg = fneg double %fpext
; Same fneg(fpext(fneg a)) chain as above, but the inner negated half
; (%fneg.a) is also returned as the second struct element.
; The checks expect the conversion to be done on the un-negated input (the
; two negations around the extension still cancel) and a single v_xor that
; materialises the negated input for the extra use in v2.
2853 define { double, half } @v_fneg_fp_extend_store_use_fneg_f16_to_f64(half %a) #0 {
2854 ; SI-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64:
2856 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2857 ; SI-NEXT: v_mov_b32_e32 v2, v0
2858 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
2859 ; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
2860 ; SI-NEXT: s_setpc_b64 s[30:31]
2862 ; VI-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64:
2864 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2865 ; VI-NEXT: v_mov_b32_e32 v2, v0
2866 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v2
2867 ; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v2
2868 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2869 ; VI-NEXT: s_setpc_b64 s[30:31]
2871 ; GFX11-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64:
2873 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2874 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
2875 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2876 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2
2877 ; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2
2878 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2879 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2880 %fneg.a = fneg half %a
2881 %fpext = fpext half %fneg.a to double
2882 %fneg = fneg double %fpext
2883 %insert.0 = insertvalue { double, half } poison, double %fneg, 0
2884 %insert.1 = insertvalue { double, half } %insert.0, half %fneg.a, 1
2885 ret { double, half } %insert.1
; fpext half -> double whose result is returned both negated and un-negated.
; The checks expect one conversion into v[2:3] (the un-negated element) and
; the negated element to be built from it: v_xor with 0x80000000 of v3 into
; v1 plus a copy of v2 into v0. The negation is not moved onto the source
; of the extension here.
; NOTE(review): despite "fneg" appearing twice in the name, the visible IR
; has a single fneg (on the extended value) and none on the input.
2888 define { double, double } @v_fneg_multi_use_fp_extend_fneg_f16_to_f64(half %a) #0 {
2889 ; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64:
2891 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2892 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
2893 ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
2894 ; SI-NEXT: v_mov_b32_e32 v0, v2
2895 ; SI-NEXT: s_setpc_b64 s[30:31]
2897 ; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64:
2899 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2900 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
2901 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
2902 ; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
2903 ; VI-NEXT: v_mov_b32_e32 v0, v2
2904 ; VI-NEXT: s_setpc_b64 s[30:31]
2906 ; GFX11-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64:
2908 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2909 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
2910 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2911 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
2912 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
2913 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2914 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
2915 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2916 %fpext = fpext half %a to double
2917 %fneg = fneg double %fpext
2918 %insert.0 = insertvalue { double, double } poison, double %fneg, 0
2919 %insert.1 = insertvalue { double, double } %insert.0, double %fpext, 1
2920 ret { double, double } %insert.1
; fpext half -> double whose result feeds both an fneg and an fmul by 4.0;
; the negated value and the product are returned.
; The checks expect a single conversion, a v_mul_f64 by 4.0 on the
; un-negated extended value, and a v_xor with 0x80000000 on the register
; holding the upper half of the double for the negated element.
2923 define { double, double } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64(half %a) #0 {
2924 ; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64:
2926 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2927 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2928 ; SI-NEXT: v_xor_b32_e32 v4, 0x80000000, v1
2929 ; SI-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0
2930 ; SI-NEXT: v_mov_b32_e32 v1, v4
2931 ; SI-NEXT: s_setpc_b64 s[30:31]
2933 ; VI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64:
2935 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2936 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
2937 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2938 ; VI-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0
2939 ; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
2940 ; VI-NEXT: s_setpc_b64 s[30:31]
2942 ; GFX11-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64:
2944 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2945 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
2946 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2947 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
2948 ; GFX11-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0
2949 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
2950 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2951 %fpext = fpext half %a to double
2952 %fneg = fneg double %fpext
; Second user of the un-negated %fpext.
2953 %mul = fmul double %fpext, 4.0
2954 %insert.0 = insertvalue { double, double } poison, double %fneg, 0
2955 %insert.1 = insertvalue { double, double } %insert.0, double %mul, 1
2956 ret { double, double } %insert.1
; fpext half -> float whose result is returned both negated and un-negated.
; The fiji and gfx1100 checks expect one v_cvt_f32_f16 into v1 and a v_xor
; with 0x80000000 of that value into v0. The hawaii checks contain no
; conversion instruction at all, only a register move and the v_xor.
2959 define { float, float } @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(half %a) #0 {
2960 ; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
2962 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2963 ; SI-NEXT: v_mov_b32_e32 v1, v0
2964 ; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
2965 ; SI-NEXT: s_setpc_b64 s[30:31]
2967 ; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
2969 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2970 ; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
2971 ; VI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
2972 ; VI-NEXT: s_setpc_b64 s[30:31]
2974 ; GFX11-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
2976 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2977 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
2978 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2979 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
2980 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2981 %fpext = fpext half %a to float
2982 %fneg = fneg float %fpext
2983 %insert.0 = insertvalue { float, float } poison, float %fneg, 0
2984 %insert.1 = insertvalue { float, float } %insert.0, float %fpext, 1
2985 ret { float, float } %insert.1
2988 ; --------------------------------------------------------------------------------
2990 ; --------------------------------------------------------------------------------
; fneg of (fptrunc double -> half).
; On every target the checks expect a v_xor of v1 with 0x80000000 as the
; first instruction after the wait, followed by a long integer sequence that
; performs the double -> half conversion. In other words the negation is
; applied to the f64 input and no sign flip is expected on the half result.
; The hawaii checks additionally end with a v_cvt_f32_f16 of the result.
2992 define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
2993 ; SI-LABEL: v_fneg_fp_round_f64_to_f16:
2995 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2996 ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
2997 ; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1
2998 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
2999 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3000 ; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1
3001 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3002 ; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2
3003 ; SI-NEXT: v_bfe_u32 v3, v1, 20, 11
3004 ; SI-NEXT: s_movk_i32 s4, 0x3f1
3005 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
3006 ; SI-NEXT: v_sub_i32_e32 v4, vcc, s4, v3
3007 ; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0
3008 ; SI-NEXT: v_med3_i32 v4, v4, 0, 13
3009 ; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2
3010 ; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5
3011 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2
3012 ; SI-NEXT: s_movk_i32 s4, 0xfc10
3013 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3014 ; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3
3015 ; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3
3016 ; SI-NEXT: v_or_b32_e32 v2, v5, v2
3017 ; SI-NEXT: v_or_b32_e32 v4, v0, v4
3018 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
3019 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
3020 ; SI-NEXT: v_and_b32_e32 v4, 7, v2
3021 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
3022 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4
3023 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
3024 ; SI-NEXT: s_or_b64 vcc, s[4:5], vcc
3025 ; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
3026 ; SI-NEXT: v_mov_b32_e32 v4, 0x7c00
3027 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
3028 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
3029 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3030 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3031 ; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
3032 ; SI-NEXT: s_movk_i32 s4, 0x40f
3033 ; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
3034 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3
3035 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3036 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
3037 ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
3038 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
3039 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
3040 ; SI-NEXT: s_setpc_b64 s[30:31]
3042 ; VI-LABEL: v_fneg_fp_round_f64_to_f16:
3044 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3045 ; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
3046 ; VI-NEXT: v_and_b32_e32 v2, 0x1ff, v1
3047 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
3048 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3049 ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1
3050 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3051 ; VI-NEXT: v_and_b32_e32 v2, 0xffe, v2
3052 ; VI-NEXT: v_bfe_u32 v3, v1, 20, 11
3053 ; VI-NEXT: s_movk_i32 s4, 0x3f1
3054 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
3055 ; VI-NEXT: v_sub_u32_e32 v4, vcc, s4, v3
3056 ; VI-NEXT: v_or_b32_e32 v2, 0x1000, v0
3057 ; VI-NEXT: v_med3_i32 v4, v4, 0, 13
3058 ; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v2
3059 ; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5
3060 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2
3061 ; VI-NEXT: s_movk_i32 s4, 0xfc10
3062 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3063 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3
3064 ; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v3
3065 ; VI-NEXT: v_or_b32_e32 v2, v5, v2
3066 ; VI-NEXT: v_or_b32_e32 v4, v0, v4
3067 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
3068 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
3069 ; VI-NEXT: v_and_b32_e32 v4, 7, v2
3070 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
3071 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4
3072 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
3073 ; VI-NEXT: s_or_b64 vcc, s[4:5], vcc
3074 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
3075 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00
3076 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
3077 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
3078 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3079 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3080 ; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
3081 ; VI-NEXT: s_movk_i32 s4, 0x40f
3082 ; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
3083 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3
3084 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
3085 ; VI-NEXT: v_mov_b32_e32 v2, 0x8000
3086 ; VI-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3087 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
3088 ; VI-NEXT: s_setpc_b64 s[30:31]
3090 ; GFX11-LABEL: v_fneg_fp_round_f64_to_f16:
3092 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3093 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
3094 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
3095 ; GFX11-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
3096 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1
3097 ; GFX11-NEXT: v_bfe_u32 v3, v1, 20, 11
3098 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3099 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
3100 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3101 ; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3
3102 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
3103 ; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0
3104 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3105 ; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13
3106 ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0
3107 ; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0
3108 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3109 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4
3110 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5
3111 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3112 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4
3113 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
3114 ; GFX11-NEXT: v_or_b32_e32 v2, v5, v2
3115 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3
3116 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3117 ; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0
3118 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3
3119 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
3120 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
3121 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3122 ; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00
3123 ; GFX11-NEXT: v_and_b32_e32 v4, 7, v2
3124 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2
3125 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3126 ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
3127 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
3128 ; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
3129 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3130 ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
3131 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
3132 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
3133 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
3134 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3135 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
3136 ; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
3137 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3138 %fpround = fptrunc double %a to half
3139 %fneg = fneg half %fpround
; fneg of (fptrunc (fneg double) -> half): the input is negated before the
; truncation and the result is negated again.
; The checks on every target contain only the integer double -> half
; conversion sequence, with no v_xor anywhere, i.e. the two negations
; cancel. The hawaii checks end with a v_cvt_f32_f16 of the result.
3143 define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
3144 ; SI-LABEL: v_fneg_fp_round_fneg_f64_to_f16:
3146 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3147 ; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1
3148 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
3149 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3150 ; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1
3151 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3152 ; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2
3153 ; SI-NEXT: v_bfe_u32 v3, v1, 20, 11
3154 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
3155 ; SI-NEXT: v_sub_i32_e32 v4, vcc, 0x3f1, v3
3156 ; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0
3157 ; SI-NEXT: v_med3_i32 v4, v4, 0, 13
3158 ; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2
3159 ; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5
3160 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2
3161 ; SI-NEXT: s_movk_i32 s4, 0xfc10
3162 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3163 ; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3
3164 ; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3
3165 ; SI-NEXT: v_or_b32_e32 v2, v5, v2
3166 ; SI-NEXT: v_or_b32_e32 v4, v0, v4
3167 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
3168 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
3169 ; SI-NEXT: v_and_b32_e32 v4, 7, v2
3170 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
3171 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4
3172 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
3173 ; SI-NEXT: s_or_b64 vcc, s[4:5], vcc
3174 ; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
3175 ; SI-NEXT: v_mov_b32_e32 v4, 0x7c00
3176 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
3177 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
3178 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3179 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3180 ; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
3181 ; SI-NEXT: s_movk_i32 s4, 0x40f
3182 ; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
3183 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3
3184 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3185 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
3186 ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
3187 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
3188 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
3189 ; SI-NEXT: s_setpc_b64 s[30:31]
3191 ; VI-LABEL: v_fneg_fp_round_fneg_f64_to_f16:
3193 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3194 ; VI-NEXT: v_and_b32_e32 v2, 0x1ff, v1
3195 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
3196 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3197 ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1
3198 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3199 ; VI-NEXT: v_and_b32_e32 v2, 0xffe, v2
3200 ; VI-NEXT: v_bfe_u32 v3, v1, 20, 11
3201 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
3202 ; VI-NEXT: v_sub_u32_e32 v4, vcc, 0x3f1, v3
3203 ; VI-NEXT: v_or_b32_e32 v2, 0x1000, v0
3204 ; VI-NEXT: v_med3_i32 v4, v4, 0, 13
3205 ; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v2
3206 ; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5
3207 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2
3208 ; VI-NEXT: s_movk_i32 s4, 0xfc10
3209 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3210 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3
3211 ; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v3
3212 ; VI-NEXT: v_or_b32_e32 v2, v5, v2
3213 ; VI-NEXT: v_or_b32_e32 v4, v0, v4
3214 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
3215 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
3216 ; VI-NEXT: v_and_b32_e32 v4, 7, v2
3217 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
3218 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4
3219 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
3220 ; VI-NEXT: s_or_b64 vcc, s[4:5], vcc
3221 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
3222 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00
3223 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
3224 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
3225 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3226 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3227 ; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
3228 ; VI-NEXT: s_movk_i32 s4, 0x40f
3229 ; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
3230 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3
3231 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
3232 ; VI-NEXT: v_mov_b32_e32 v2, 0x8000
3233 ; VI-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3234 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
3235 ; VI-NEXT: s_setpc_b64 s[30:31]
3237 ; GFX11-LABEL: v_fneg_fp_round_fneg_f64_to_f16:
3239 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3240 ; GFX11-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
3241 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1
3242 ; GFX11-NEXT: v_bfe_u32 v3, v1, 20, 11
3243 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3244 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
3245 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
3246 ; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3
3247 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
3248 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
3249 ; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0
3250 ; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13
3251 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3252 ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0
3253 ; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0
3254 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4
3255 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3256 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5
3257 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4
3258 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
3259 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3260 ; GFX11-NEXT: v_or_b32_e32 v2, v5, v2
3261 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3
3262 ; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0
3263 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3
3264 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
3265 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3266 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
3267 ; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00
3268 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3269 ; GFX11-NEXT: v_and_b32_e32 v4, 7, v2
3270 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2
3271 ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
3272 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
3273 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
3274 ; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
3275 ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
3276 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
3277 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3278 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
3279 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
3280 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
3281 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3282 ; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
3283 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3284 %fneg.a = fneg double %a
3285 %fpround = fptrunc double %fneg.a to half
3286 %fneg = fneg half %fpround
; fneg(fptrunc(fneg %a)) where the inner f64 negation has a second use because
; it is also returned in the result struct. In all three outputs the f64->f16
; round is expanded into integer bit manipulation; the f16 sign bit is masked
; (0x8000) out of the original high dword of %a, and the returned double is
; produced by one v_xor_b32 with 0x80000000 on that same dword.
3290 define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 {
3291 ; SI-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
3293 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3294 ; SI-NEXT: v_mov_b32_e32 v3, v0
3295 ; SI-NEXT: v_and_b32_e32 v0, 0x1ff, v1
3296 ; SI-NEXT: v_or_b32_e32 v0, v0, v3
3297 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3298 ; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1
3299 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3300 ; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2
3301 ; SI-NEXT: v_bfe_u32 v4, v1, 20, 11
3302 ; SI-NEXT: s_movk_i32 s4, 0x3f1
3303 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
3304 ; SI-NEXT: v_sub_i32_e32 v5, vcc, s4, v4
3305 ; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0
3306 ; SI-NEXT: v_med3_i32 v5, v5, 0, 13
3307 ; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v2
3308 ; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v6
3309 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2
3310 ; SI-NEXT: s_movk_i32 s4, 0xfc10
3311 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3312 ; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4
3313 ; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4
3314 ; SI-NEXT: v_or_b32_e32 v2, v6, v2
3315 ; SI-NEXT: v_or_b32_e32 v5, v0, v5
3316 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4
3317 ; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
3318 ; SI-NEXT: v_and_b32_e32 v5, 7, v2
3319 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5
3320 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v5
3321 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
3322 ; SI-NEXT: s_or_b64 vcc, s[4:5], vcc
3323 ; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
3324 ; SI-NEXT: v_mov_b32_e32 v5, 0x7c00
3325 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4
3326 ; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
3327 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3328 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3329 ; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
3330 ; SI-NEXT: s_movk_i32 s4, 0x40f
3331 ; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
3332 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4
3333 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
3334 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
3335 ; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2
3336 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
3337 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
3338 ; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
3339 ; SI-NEXT: v_mov_b32_e32 v1, v3
3340 ; SI-NEXT: s_setpc_b64 s[30:31]
3342 ; VI-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
3344 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3345 ; VI-NEXT: v_mov_b32_e32 v3, v0
3346 ; VI-NEXT: v_and_b32_e32 v0, 0x1ff, v1
3347 ; VI-NEXT: v_or_b32_e32 v0, v0, v3
3348 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3349 ; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v1
3350 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3351 ; VI-NEXT: v_and_b32_e32 v4, 0xffe, v4
3352 ; VI-NEXT: v_bfe_u32 v5, v1, 20, 11
3353 ; VI-NEXT: s_movk_i32 s4, 0x3f1
3354 ; VI-NEXT: v_or_b32_e32 v0, v4, v0
3355 ; VI-NEXT: v_sub_u32_e32 v6, vcc, s4, v5
3356 ; VI-NEXT: v_or_b32_e32 v4, 0x1000, v0
3357 ; VI-NEXT: v_med3_i32 v6, v6, 0, 13
3358 ; VI-NEXT: v_lshrrev_b32_e32 v7, v6, v4
3359 ; VI-NEXT: v_lshlrev_b32_e32 v6, v6, v7
3360 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4
3361 ; VI-NEXT: s_movk_i32 s4, 0xfc10
3362 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
3363 ; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v5
3364 ; VI-NEXT: v_lshlrev_b32_e32 v6, 12, v5
3365 ; VI-NEXT: v_or_b32_e32 v4, v7, v4
3366 ; VI-NEXT: v_or_b32_e32 v6, v0, v6
3367 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
3368 ; VI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
3369 ; VI-NEXT: v_and_b32_e32 v6, 7, v4
3370 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6
3371 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v6
3372 ; VI-NEXT: v_lshrrev_b32_e32 v4, 2, v4
3373 ; VI-NEXT: s_or_b64 vcc, s[4:5], vcc
3374 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
3375 ; VI-NEXT: v_mov_b32_e32 v6, 0x7c00
3376 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
3377 ; VI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
3378 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3379 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3380 ; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
3381 ; VI-NEXT: s_movk_i32 s4, 0x40f
3382 ; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
3383 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v5
3384 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
3385 ; VI-NEXT: v_mov_b32_e32 v4, 0x8000
3386 ; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
3387 ; VI-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3388 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
3389 ; VI-NEXT: v_mov_b32_e32 v1, v3
3390 ; VI-NEXT: s_setpc_b64 s[30:31]
3392 ; GFX11-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
3394 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3395 ; GFX11-NEXT: v_and_or_b32 v2, 0x1ff, v1, v0
3396 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
3397 ; GFX11-NEXT: v_bfe_u32 v4, v1, 20, 11
3398 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3399 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
3400 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v4
3401 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
3402 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
3403 ; GFX11-NEXT: v_and_or_b32 v2, 0xffe, v3, v2
3404 ; GFX11-NEXT: v_med3_i32 v3, v5, 0, 13
3405 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3406 ; GFX11-NEXT: v_or_b32_e32 v5, 0x1000, v2
3407 ; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v2
3408 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, v3, v5
3409 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3410 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, v3, v6
3411 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v5
3412 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
3413 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3414 ; GFX11-NEXT: v_or_b32_e32 v3, v6, v3
3415 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0xfffffc10, v4
3416 ; GFX11-NEXT: v_lshl_or_b32 v5, v4, 12, v2
3417 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v4
3418 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
3419 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3420 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
3421 ; GFX11-NEXT: v_lshl_or_b32 v2, v2, 9, 0x7c00
3422 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3423 ; GFX11-NEXT: v_and_b32_e32 v5, 7, v3
3424 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v3
3425 ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
3426 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v5
3427 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
3428 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
3429 ; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
3430 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
3431 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v4
3432 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
3433 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4
3434 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3435 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
3436 ; GFX11-NEXT: v_and_or_b32 v3, 0x8000, v5, v2
3437 ; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
3438 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
3439 ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3
3440 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test. The assertions above are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
3441 %fneg.a = fneg double %a
3442 %fpround = fptrunc double %fneg.a to half
3443 %fneg = fneg half %fpround
3444 %insert.0 = insertvalue { half, double } poison, half %fneg, 0
3445 %insert.1 = insertvalue { half, double } %insert.0, double %fneg.a, 1
3446 ret { half, double } %insert.1
; fneg(fptrunc(fneg %a)) where the inner f64 negation is also an operand of an
; fmul with %c. The outputs keep no separate negate instruction: the fmul
; becomes v_mul_f64 with a negated first source (-v[0:1]), and the f16 sign bit
; is masked (0x8000) out of the original high dword of %a.
3449 define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, double %c) #0 {
3450 ; SI-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f16:
3452 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3453 ; SI-NEXT: v_and_b32_e32 v4, 0x1ff, v1
3454 ; SI-NEXT: v_or_b32_e32 v4, v4, v0
3455 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
3456 ; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
3457 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
3458 ; SI-NEXT: v_and_b32_e32 v5, 0xffe, v5
3459 ; SI-NEXT: v_bfe_u32 v6, v1, 20, 11
3460 ; SI-NEXT: s_movk_i32 s4, 0x3f1
3461 ; SI-NEXT: v_or_b32_e32 v4, v5, v4
3462 ; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6
3463 ; SI-NEXT: v_or_b32_e32 v5, 0x1000, v4
3464 ; SI-NEXT: v_med3_i32 v7, v7, 0, 13
3465 ; SI-NEXT: v_lshrrev_b32_e32 v8, v7, v5
3466 ; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v8
3467 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5
3468 ; SI-NEXT: s_movk_i32 s4, 0xfc10
3469 ; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
3470 ; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v6
3471 ; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6
3472 ; SI-NEXT: v_or_b32_e32 v5, v8, v5
3473 ; SI-NEXT: v_or_b32_e32 v7, v4, v7
3474 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6
3475 ; SI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
3476 ; SI-NEXT: v_and_b32_e32 v7, 7, v5
3477 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
3478 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v7
3479 ; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5
3480 ; SI-NEXT: s_or_b64 vcc, s[4:5], vcc
3481 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
3482 ; SI-NEXT: v_mov_b32_e32 v7, 0x7c00
3483 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6
3484 ; SI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
3485 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
3486 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
3487 ; SI-NEXT: v_lshlrev_b32_e32 v4, 9, v4
3488 ; SI-NEXT: s_movk_i32 s4, 0x40f
3489 ; SI-NEXT: v_or_b32_e32 v4, 0x7c00, v4
3490 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v6
3491 ; SI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
3492 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
3493 ; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5
3494 ; SI-NEXT: v_or_b32_e32 v4, v5, v4
3495 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
3496 ; SI-NEXT: v_mul_f64 v[1:2], -v[0:1], v[2:3]
3497 ; SI-NEXT: v_mov_b32_e32 v0, v4
3498 ; SI-NEXT: s_setpc_b64 s[30:31]
3500 ; VI-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f16:
3502 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3503 ; VI-NEXT: v_and_b32_e32 v4, 0x1ff, v1
3504 ; VI-NEXT: v_or_b32_e32 v4, v4, v0
3505 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
3506 ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
3507 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
3508 ; VI-NEXT: v_and_b32_e32 v5, 0xffe, v5
3509 ; VI-NEXT: v_bfe_u32 v6, v1, 20, 11
3510 ; VI-NEXT: s_movk_i32 s4, 0x3f1
3511 ; VI-NEXT: v_or_b32_e32 v4, v5, v4
3512 ; VI-NEXT: v_sub_u32_e32 v7, vcc, s4, v6
3513 ; VI-NEXT: v_or_b32_e32 v5, 0x1000, v4
3514 ; VI-NEXT: v_med3_i32 v7, v7, 0, 13
3515 ; VI-NEXT: v_lshrrev_b32_e32 v8, v7, v5
3516 ; VI-NEXT: v_lshlrev_b32_e32 v7, v7, v8
3517 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5
3518 ; VI-NEXT: s_movk_i32 s4, 0xfc10
3519 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
3520 ; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6
3521 ; VI-NEXT: v_lshlrev_b32_e32 v7, 12, v6
3522 ; VI-NEXT: v_or_b32_e32 v5, v8, v5
3523 ; VI-NEXT: v_or_b32_e32 v7, v4, v7
3524 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6
3525 ; VI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
3526 ; VI-NEXT: v_and_b32_e32 v7, 7, v5
3527 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
3528 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v7
3529 ; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5
3530 ; VI-NEXT: s_or_b64 vcc, s[4:5], vcc
3531 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
3532 ; VI-NEXT: v_mul_f64 v[2:3], -v[0:1], v[2:3]
3533 ; VI-NEXT: v_mov_b32_e32 v7, 0x7c00
3534 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6
3535 ; VI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
3536 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
3537 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
3538 ; VI-NEXT: v_lshlrev_b32_e32 v4, 9, v4
3539 ; VI-NEXT: s_movk_i32 s4, 0x40f
3540 ; VI-NEXT: v_or_b32_e32 v4, 0x7c00, v4
3541 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v6
3542 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
3543 ; VI-NEXT: v_mov_b32_e32 v4, 0x8000
3544 ; VI-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3545 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
3546 ; VI-NEXT: v_mov_b32_e32 v1, v2
3547 ; VI-NEXT: v_mov_b32_e32 v2, v3
3548 ; VI-NEXT: s_setpc_b64 s[30:31]
3550 ; GFX11-LABEL: v_fneg_fp_round_multi_use_fneg_f64_to_f16:
3552 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3553 ; GFX11-NEXT: v_and_or_b32 v4, 0x1ff, v1, v0
3554 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1
3555 ; GFX11-NEXT: v_bfe_u32 v6, v1, 20, 11
3556 ; GFX11-NEXT: v_mul_f64 v[2:3], -v[0:1], v[2:3]
3557 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3558 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
3559 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
3560 ; GFX11-NEXT: v_sub_nc_u32_e32 v7, 0x3f1, v6
3561 ; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6
3562 ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
3563 ; GFX11-NEXT: v_and_or_b32 v4, 0xffe, v5, v4
3564 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
3565 ; GFX11-NEXT: v_med3_i32 v5, v7, 0, 13
3566 ; GFX11-NEXT: v_or_b32_e32 v7, 0x1000, v4
3567 ; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v4
3568 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3569 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, v5, v7
3570 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, v5, v8
3571 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
3572 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v7
3573 ; GFX11-NEXT: v_lshl_or_b32 v7, v6, 12, v4
3574 ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1
3575 ; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
3576 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6
3577 ; GFX11-NEXT: v_lshl_or_b32 v4, v4, 9, 0x7c00
3578 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3579 ; GFX11-NEXT: v_or_b32_e32 v5, v8, v5
3580 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v5, vcc_lo
3581 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3582 ; GFX11-NEXT: v_and_b32_e32 v5, 7, v0
3583 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 2, v0
3584 ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
3585 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v5
3586 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
3587 ; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
3588 ; GFX11-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo
3589 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6
3590 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3591 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo
3592 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
3593 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
3594 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3595 ; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
3596 ; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
3597 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test. The assertions above are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
3598 %fneg.a = fneg double %a
3599 %fpround = fptrunc double %fneg.a to half
3600 %fneg = fneg half %fpround
3601 %use1 = fmul double %fneg.a, %c
3602 %insert.0 = insertvalue { half, double } poison, half %fneg, 0
3603 %insert.1 = insertvalue { half, double } %insert.0, double %use1, 1
3604 ret { half, double } %insert.1
; Returns both fneg(fptrunc %a) and the un-negated fptrunc %a, so the rounded
; value has two uses. The expanded round is computed once; hawaii then applies
; the negation as a source modifier on one of the two f16->f32 conversions
; (v_cvt_f32_f16_e64 v0, -v0), while fiji and gfx1100 flip the f16 sign bit
; with v_xor_b32 0x8000.
3607 define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
3608 ; SI-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
3610 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3611 ; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1
3612 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
3613 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3614 ; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1
3615 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3616 ; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2
3617 ; SI-NEXT: v_bfe_u32 v3, v1, 20, 11
3618 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
3619 ; SI-NEXT: v_sub_i32_e32 v4, vcc, 0x3f1, v3
3620 ; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0
3621 ; SI-NEXT: v_med3_i32 v4, v4, 0, 13
3622 ; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2
3623 ; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5
3624 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2
3625 ; SI-NEXT: s_movk_i32 s4, 0xfc10
3626 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3627 ; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3
3628 ; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3
3629 ; SI-NEXT: v_or_b32_e32 v2, v5, v2
3630 ; SI-NEXT: v_or_b32_e32 v4, v0, v4
3631 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
3632 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
3633 ; SI-NEXT: v_and_b32_e32 v4, 7, v2
3634 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
3635 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4
3636 ; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
3637 ; SI-NEXT: s_or_b64 vcc, s[4:5], vcc
3638 ; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
3639 ; SI-NEXT: v_mov_b32_e32 v4, 0x7c00
3640 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
3641 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
3642 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3643 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3644 ; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
3645 ; SI-NEXT: s_movk_i32 s4, 0x40f
3646 ; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
3647 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3
3648 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3649 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
3650 ; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
3651 ; SI-NEXT: v_or_b32_e32 v1, v1, v0
3652 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1
3653 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
3654 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
3655 ; SI-NEXT: s_setpc_b64 s[30:31]
3657 ; VI-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
3659 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3660 ; VI-NEXT: v_and_b32_e32 v2, 0x1ff, v1
3661 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
3662 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3663 ; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1
3664 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3665 ; VI-NEXT: v_and_b32_e32 v2, 0xffe, v2
3666 ; VI-NEXT: v_bfe_u32 v3, v1, 20, 11
3667 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
3668 ; VI-NEXT: v_sub_u32_e32 v4, vcc, 0x3f1, v3
3669 ; VI-NEXT: v_or_b32_e32 v2, 0x1000, v0
3670 ; VI-NEXT: v_med3_i32 v4, v4, 0, 13
3671 ; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v2
3672 ; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5
3673 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2
3674 ; VI-NEXT: s_movk_i32 s4, 0xfc10
3675 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
3676 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3
3677 ; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v3
3678 ; VI-NEXT: v_or_b32_e32 v2, v5, v2
3679 ; VI-NEXT: v_or_b32_e32 v4, v0, v4
3680 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
3681 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
3682 ; VI-NEXT: v_and_b32_e32 v4, 7, v2
3683 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
3684 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v4
3685 ; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
3686 ; VI-NEXT: s_or_b64 vcc, s[4:5], vcc
3687 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
3688 ; VI-NEXT: v_mov_b32_e32 v4, 0x7c00
3689 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
3690 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
3691 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
3692 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
3693 ; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0
3694 ; VI-NEXT: s_movk_i32 s4, 0x40f
3695 ; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0
3696 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3
3697 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
3698 ; VI-NEXT: v_mov_b32_e32 v2, 0x8000
3699 ; VI-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3700 ; VI-NEXT: v_or_b32_e32 v1, v1, v0
3701 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v1
3702 ; VI-NEXT: s_setpc_b64 s[30:31]
3704 ; GFX11-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
3706 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3707 ; GFX11-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
3708 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1
3709 ; GFX11-NEXT: v_bfe_u32 v3, v1, 20, 11
3710 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
3711 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
3712 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
3713 ; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3
3714 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
3715 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
3716 ; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0
3717 ; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13
3718 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3719 ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0
3720 ; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, v0
3721 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4
3722 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3723 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5
3724 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4
3725 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
3726 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
3727 ; GFX11-NEXT: v_or_b32_e32 v2, v5, v2
3728 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3
3729 ; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0
3730 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3
3731 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
3732 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3733 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
3734 ; GFX11-NEXT: v_lshl_or_b32 v0, v0, 9, 0x7c00
3735 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3736 ; GFX11-NEXT: v_and_b32_e32 v4, 7, v2
3737 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2
3738 ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
3739 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v4
3740 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
3741 ; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
3742 ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
3743 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
3744 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3745 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
3746 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
3747 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
3748 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3749 ; GFX11-NEXT: v_and_or_b32 v1, 0x8000, v1, v0
3750 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1
3751 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test. The assertions above are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
3752 %fpround = fptrunc double %a to half
3753 %fneg = fneg half %fpround
3754 %insert.0 = insertvalue { half, half } poison, half %fneg, 0
3755 %insert.1 = insertvalue { half, half } %insert.0, half %fpround, 1
3756 ret { half, half } %insert.1
3759 ; --------------------------------------------------------------------------------
3761 ; --------------------------------------------------------------------------------
; fneg(llvm.trunc.f16(%a)). No separate negate instruction is expected: fiji
; and gfx1100 use v_trunc_f16_e64 with a negated source, and hawaii puts the
; negation on the f32->f16 conversion that precedes v_trunc_f32.
3763 define half @v_fneg_trunc_f16(half %a) #0 {
3764 ; SI-LABEL: v_fneg_trunc_f16:
3766 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3767 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
3768 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
3769 ; SI-NEXT: v_trunc_f32_e32 v0, v0
3770 ; SI-NEXT: s_setpc_b64 s[30:31]
3772 ; VI-LABEL: v_fneg_trunc_f16:
3774 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3775 ; VI-NEXT: v_trunc_f16_e64 v0, -v0
3776 ; VI-NEXT: s_setpc_b64 s[30:31]
3778 ; GFX11-LABEL: v_fneg_trunc_f16:
3780 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3781 ; GFX11-NEXT: v_trunc_f16_e64 v0, -v0
3782 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3783 %trunc = call half @llvm.trunc.f16(half %a)
3784 %fneg = fneg half %trunc
3788 ; --------------------------------------------------------------------------------
3790 ; --------------------------------------------------------------------------------
; fneg(llvm.round.f16(%a)). round is expanded (trunc, sub, compare of the
; fraction magnitude against 0.5, bfi, add). Without
; -enable-no-signed-zeros-fp-math the negation stays as a trailing v_xor_b32 of
; the sign bit; with it, the final add is instead emitted as a subtract with a
; negated first source (v_sub_* v0, -v1, v0).
3792 define half @v_fneg_round_f16(half %a) #0 {
3793 ; SI-SAFE-LABEL: v_fneg_round_f16:
3795 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3796 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
3797 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
3798 ; SI-SAFE-NEXT: v_trunc_f32_e32 v1, v0
3799 ; SI-SAFE-NEXT: v_sub_f32_e32 v2, v0, v1
3800 ; SI-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
3801 ; SI-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
3802 ; SI-SAFE-NEXT: s_brev_b32 s4, -2
3803 ; SI-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0
3804 ; SI-SAFE-NEXT: v_add_f32_e32 v0, v1, v0
3805 ; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
3806 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
3808 ; SI-NSZ-LABEL: v_fneg_round_f16:
3810 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3811 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
3812 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
3813 ; SI-NSZ-NEXT: v_trunc_f32_e32 v1, v0
3814 ; SI-NSZ-NEXT: v_sub_f32_e32 v2, v0, v1
3815 ; SI-NSZ-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
3816 ; SI-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5]
3817 ; SI-NSZ-NEXT: s_brev_b32 s4, -2
3818 ; SI-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0
3819 ; SI-NSZ-NEXT: v_sub_f32_e64 v0, -v1, v0
3820 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
3822 ; VI-SAFE-LABEL: v_fneg_round_f16:
3824 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3825 ; VI-SAFE-NEXT: v_trunc_f16_e32 v1, v0
3826 ; VI-SAFE-NEXT: v_sub_f16_e32 v2, v0, v1
3827 ; VI-SAFE-NEXT: v_mov_b32_e32 v3, 0x3c00
3828 ; VI-SAFE-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
3829 ; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
3830 ; VI-SAFE-NEXT: s_movk_i32 s4, 0x7fff
3831 ; VI-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0
3832 ; VI-SAFE-NEXT: v_add_f16_e32 v0, v1, v0
3833 ; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
3834 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
3836 ; VI-NSZ-LABEL: v_fneg_round_f16:
3838 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3839 ; VI-NSZ-NEXT: v_trunc_f16_e32 v1, v0
3840 ; VI-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1
3841 ; VI-NSZ-NEXT: v_mov_b32_e32 v3, 0x3c00
3842 ; VI-NSZ-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
3843 ; VI-NSZ-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
3844 ; VI-NSZ-NEXT: s_movk_i32 s4, 0x7fff
3845 ; VI-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0
3846 ; VI-NSZ-NEXT: v_sub_f16_e64 v0, -v1, v0
3847 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
3849 ; GFX11-SAFE-LABEL: v_fneg_round_f16:
3850 ; GFX11-SAFE: ; %bb.0:
3851 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3852 ; GFX11-SAFE-NEXT: v_trunc_f16_e32 v1, v0
3853 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3854 ; GFX11-SAFE-NEXT: v_sub_f16_e32 v2, v0, v1
3855 ; GFX11-SAFE-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
3856 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3857 ; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
3858 ; GFX11-SAFE-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0
3859 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3860 ; GFX11-SAFE-NEXT: v_add_f16_e32 v0, v1, v0
3861 ; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
3862 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
3864 ; GFX11-NSZ-LABEL: v_fneg_round_f16:
3865 ; GFX11-NSZ: ; %bb.0:
3866 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3867 ; GFX11-NSZ-NEXT: v_trunc_f16_e32 v1, v0
3868 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3869 ; GFX11-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1
3870 ; GFX11-NSZ-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
3871 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3872 ; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
3873 ; GFX11-NSZ-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0
3874 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
3875 ; GFX11-NSZ-NEXT: v_sub_f16_e64 v0, -v1, v0
3876 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
3877 %round = call half @llvm.round.f16(half %a)
3878 %fneg = fneg half %round
3882 ; --------------------------------------------------------------------------------
3884 ; --------------------------------------------------------------------------------
; fneg(llvm.rint.f16(%a)). No separate negate instruction is expected: fiji and
; gfx1100 use v_rndne_f16_e64 with a negated source, and hawaii puts the
; negation on the f32->f16 conversion that precedes v_rndne_f32.
3886 define half @v_fneg_rint_f16(half %a) #0 {
3887 ; SI-LABEL: v_fneg_rint_f16:
3889 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3890 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
3891 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
3892 ; SI-NEXT: v_rndne_f32_e32 v0, v0
3893 ; SI-NEXT: s_setpc_b64 s[30:31]
3895 ; VI-LABEL: v_fneg_rint_f16:
3897 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3898 ; VI-NEXT: v_rndne_f16_e64 v0, -v0
3899 ; VI-NEXT: s_setpc_b64 s[30:31]
3901 ; GFX11-LABEL: v_fneg_rint_f16:
3903 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3904 ; GFX11-NEXT: v_rndne_f16_e64 v0, -v0
3905 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3906 %rint = call half @llvm.rint.f16(half %a)
3907 %fneg = fneg half %rint
3911 ; --------------------------------------------------------------------------------
3913 ; --------------------------------------------------------------------------------
; fneg(llvm.nearbyint.f16(%a)). No separate negate instruction is expected:
; fiji and gfx1100 use v_rndne_f16_e64 with a negated source, and hawaii puts
; the negation on the f32->f16 conversion that precedes v_rndne_f32.
3915 define half @v_fneg_nearbyint_f16(half %a) #0 {
3916 ; SI-LABEL: v_fneg_nearbyint_f16:
3918 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3919 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
3920 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
3921 ; SI-NEXT: v_rndne_f32_e32 v0, v0
3922 ; SI-NEXT: s_setpc_b64 s[30:31]
3924 ; VI-LABEL: v_fneg_nearbyint_f16:
3926 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3927 ; VI-NEXT: v_rndne_f16_e64 v0, -v0
3928 ; VI-NEXT: s_setpc_b64 s[30:31]
3930 ; GFX11-LABEL: v_fneg_nearbyint_f16:
3932 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3933 ; GFX11-NEXT: v_rndne_f16_e64 v0, -v0
3934 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3935 %nearbyint = call half @llvm.nearbyint.f16(half %a)
3936 %fneg = fneg half %nearbyint
3940 ; --------------------------------------------------------------------------------
3942 ; --------------------------------------------------------------------------------
; fneg(llvm.sin.f16(%a)). hawaii negates the input in the f32->f16 conversion
; and then scales by 0x3e22f983 before v_fract_f32/v_sin_f32. fiji and gfx1100
; have no negate instruction either; their pre-scale constant is 0xb118, which
; has the f16 sign bit set (presumably -1/(2*pi), i.e. the negation folded into
; the constant -- TODO confirm). gfx1100 emits no v_fract before v_sin_f16.
3944 define half @v_fneg_sin_f16(half %a) #0 {
3945 ; SI-LABEL: v_fneg_sin_f16:
3947 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3948 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
3949 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
3950 ; SI-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0
3951 ; SI-NEXT: v_fract_f32_e32 v0, v0
3952 ; SI-NEXT: v_sin_f32_e32 v0, v0
3953 ; SI-NEXT: s_setpc_b64 s[30:31]
3955 ; VI-LABEL: v_fneg_sin_f16:
3957 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3958 ; VI-NEXT: v_mul_f16_e32 v0, 0xb118, v0
3959 ; VI-NEXT: v_fract_f16_e32 v0, v0
3960 ; VI-NEXT: v_sin_f16_e32 v0, v0
3961 ; VI-NEXT: s_setpc_b64 s[30:31]
3963 ; GFX11-LABEL: v_fneg_sin_f16:
3965 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3966 ; GFX11-NEXT: v_mul_f16_e32 v0, 0xb118, v0
3967 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3968 ; GFX11-NEXT: v_sin_f16_e32 v0, v0
3969 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3970 %sin = call half @llvm.sin.f16(half %a)
3971 %fneg = fneg half %sin
3975 ; --------------------------------------------------------------------------------
3976 ; fcanonicalize tests
3977 ; --------------------------------------------------------------------------------
; fneg(llvm.canonicalize.f16(%a)). fiji and gfx1100 canonicalize with
; v_max_f16_e64 using the negated input for both operands; hawaii only flips
; bit 31 (0x80000000) of v0 with v_xor_b32.
3979 define half @v_fneg_canonicalize_f16(half %a) #0 {
3980 ; SI-LABEL: v_fneg_canonicalize_f16:
3982 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3983 ; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
3984 ; SI-NEXT: s_setpc_b64 s[30:31]
3986 ; VI-LABEL: v_fneg_canonicalize_f16:
3988 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3989 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
3990 ; VI-NEXT: s_setpc_b64 s[30:31]
3992 ; GFX11-LABEL: v_fneg_canonicalize_f16:
3994 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3995 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
3996 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; The value is named after the intrinsic that produces it; SSA value names do
; not appear in the generated code, so the assertions above are unaffected.
3997 %canonicalize = call half @llvm.canonicalize.f16(half %a)
3998 %fneg = fneg half %canonicalize
4002 ; --------------------------------------------------------------------------------
4004 ; --------------------------------------------------------------------------------
; %mul is stored un-negated, and its negation (%fneg) is consumed by a second
; fmul (%mul1) that the checks place in the %if block. In the VI/GFX11 checks
; the negation appears as a -v2 source modifier on that second multiply; the
; first multiply is emitted without a modifier.
4006 define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half %c, i32 %d) #0 {
4007 ; SI-LABEL: v_fneg_copytoreg_f16:
4009 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4010 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
4011 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
4012 ; SI-NEXT: v_and_b32_e32 v6, 0x3ff, v31
4013 ; SI-NEXT: v_lshlrev_b32_e32 v6, 1, v6
4014 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
4015 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
4016 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v6
4017 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4018 ; SI-NEXT: v_mul_f32_e32 v2, v2, v3
4019 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
4020 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
4021 ; SI-NEXT: s_cbranch_execz .LBB81_2
4022 ; SI-NEXT: ; %bb.1: ; %if
4023 ; SI-NEXT: v_cvt_f16_f32_e64 v3, -v2
4024 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
4025 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
4026 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
4027 ; SI-NEXT: v_mul_f32_e32 v3, v3, v4
4028 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
4029 ; SI-NEXT: flat_store_short v[0:1], v3
4030 ; SI-NEXT: s_waitcnt vmcnt(0)
4031 ; SI-NEXT: .LBB81_2: ; %endif
4032 ; SI-NEXT: s_or_b64 exec, exec, s[4:5]
4033 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
4034 ; SI-NEXT: flat_store_short v[0:1], v2
4035 ; SI-NEXT: s_waitcnt vmcnt(0)
4036 ; SI-NEXT: s_setpc_b64 s[30:31]
4038 ; VI-LABEL: v_fneg_copytoreg_f16:
4040 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4041 ; VI-NEXT: v_and_b32_e32 v6, 0x3ff, v31
4042 ; VI-NEXT: v_lshlrev_b32_e32 v6, 1, v6
4043 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6
4044 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4045 ; VI-NEXT: v_mul_f16_e32 v2, v2, v3
4046 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
4047 ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
4048 ; VI-NEXT: s_cbranch_execz .LBB81_2
4049 ; VI-NEXT: ; %bb.1: ; %if
4050 ; VI-NEXT: v_mul_f16_e64 v3, -v2, v4
4051 ; VI-NEXT: flat_store_short v[0:1], v3
4052 ; VI-NEXT: s_waitcnt vmcnt(0)
4053 ; VI-NEXT: .LBB81_2: ; %endif
4054 ; VI-NEXT: s_or_b64 exec, exec, s[4:5]
4055 ; VI-NEXT: flat_store_short v[0:1], v2
4056 ; VI-NEXT: s_waitcnt vmcnt(0)
4057 ; VI-NEXT: s_setpc_b64 s[30:31]
4059 ; GFX11-LABEL: v_fneg_copytoreg_f16:
4061 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4062 ; GFX11-NEXT: v_and_b32_e32 v6, 0x3ff, v31
4063 ; GFX11-NEXT: v_mul_f16_e32 v2, v2, v3
4064 ; GFX11-NEXT: s_mov_b32 s0, exec_lo
4065 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
4066 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 1, v6
4067 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
4068 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
4069 ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v5
4070 ; GFX11-NEXT: s_cbranch_execz .LBB81_2
4071 ; GFX11-NEXT: ; %bb.1: ; %if
4072 ; GFX11-NEXT: v_mul_f16_e64 v3, -v2, v4
4073 ; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc
4074 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4075 ; GFX11-NEXT: .LBB81_2: ; %endif
4076 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
4077 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off dlc
4078 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4079 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4080 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4081 %tid.ext = sext i32 %tid to i64
4082 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
4083 %mul = fmul half %a, %b
4084 %fneg = fneg half %mul
4085 %cmp0 = icmp eq i32 %d, 0
4086 br i1 %cmp0, label %if, label %endif
4089 %mul1 = fmul half %fneg, %c
4090 store volatile half %mul1, ptr addrspace(1) %out.gep
4094 store volatile half %mul, ptr addrspace(1) %out.gep
4098 ; --------------------------------------------------------------------------------
4100 ; --------------------------------------------------------------------------------
4102 ; Can't fold into use, so should fold into source
; The fneg result is passed to an inline asm "v" operand. In the VI/GFX11
; checks the negation is applied as a -v1 modifier on an input of the multiply
; that produces the value, so no separate negate instruction is emitted.
4103 define half @v_fneg_inlineasm_f16(half %a, half %b, half %c, i32 %d) #0 {
4104 ; SI-LABEL: v_fneg_inlineasm_f16:
4106 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4107 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
4108 ; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
4109 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4110 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4111 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
4112 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v0
4113 ; SI-NEXT: ;;#ASMSTART
4115 ; SI-NEXT: ;;#ASMEND
4116 ; SI-NEXT: s_setpc_b64 s[30:31]
4118 ; VI-LABEL: v_fneg_inlineasm_f16:
4120 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4121 ; VI-NEXT: v_mul_f16_e64 v0, v0, -v1
4122 ; VI-NEXT: ;;#ASMSTART
4124 ; VI-NEXT: ;;#ASMEND
4125 ; VI-NEXT: s_setpc_b64 s[30:31]
4127 ; GFX11-LABEL: v_fneg_inlineasm_f16:
4129 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4130 ; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v1
4131 ; GFX11-NEXT: ;;#ASMSTART
4132 ; GFX11-NEXT: ; use v0
4133 ; GFX11-NEXT: ;;#ASMEND
4134 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4135 %mul = fmul half %a, %b
4136 %fneg = fneg half %mul
4137 call void asm sideeffect "; use $0", "v"(half %fneg) #0
4141 ; --------------------------------------------------------------------------------
4143 ; --------------------------------------------------------------------------------
4145 ; Can't fold into use, so should fold into source
; NOTE(review): in the VI/GFX11 checks below the negation is a separate
; v_xor_b32 with 0x8000 applied to the multiply result (the asm uses v1 while
; v0 keeps the un-negated product), not a modifier on the multiply's inputs.
; The "fold into source" wording above may therefore not describe this
; multi-use case; confirm the intended expectation.
4146 define half @v_fneg_inlineasm_multi_use_src_f16(ptr addrspace(1) %out, half %a, half %b, half %c, i32 %d) #0 {
4147 ; SI-LABEL: v_fneg_inlineasm_multi_use_src_f16:
4149 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4150 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v2
4151 ; SI-NEXT: v_cvt_f16_f32_e64 v1, -v3
4152 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4153 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
4154 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
4155 ; SI-NEXT: v_cvt_f16_f32_e64 v1, -v0
4156 ; SI-NEXT: ;;#ASMSTART
4158 ; SI-NEXT: ;;#ASMEND
4159 ; SI-NEXT: s_setpc_b64 s[30:31]
4161 ; VI-LABEL: v_fneg_inlineasm_multi_use_src_f16:
4163 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4164 ; VI-NEXT: v_mul_f16_e32 v0, v2, v3
4165 ; VI-NEXT: v_xor_b32_e32 v1, 0x8000, v0
4166 ; VI-NEXT: ;;#ASMSTART
4168 ; VI-NEXT: ;;#ASMEND
4169 ; VI-NEXT: s_setpc_b64 s[30:31]
4171 ; GFX11-LABEL: v_fneg_inlineasm_multi_use_src_f16:
4173 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4174 ; GFX11-NEXT: v_mul_f16_e32 v0, v2, v3
4175 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4176 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v0
4177 ; GFX11-NEXT: ;;#ASMSTART
4178 ; GFX11-NEXT: ; use v1
4179 ; GFX11-NEXT: ;;#ASMEND
4180 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4181 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4182 %tid.ext = sext i32 %tid to i64
4183 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
4184 %mul = fmul half %a, %b
4185 %fneg = fneg half %mul
4186 call void asm sideeffect "; use $0", "v"(half %fneg) #0
4190 ; --------------------------------------------------------------------------------
4191 ; code size regression tests
4192 ; --------------------------------------------------------------------------------
4194 ; There are multiple users of the fneg that must use a VOP3
4195 ; instruction, so there is no penalty
; Both llvm.fma.f16 calls take %fneg.a as their first operand; every check
; prefix shows the negation as a source modifier on both v_fma instructions.
4196 define { half, half } @multiuse_fneg_2_vop3_users_f16(half %a, half %b, half %c) #0 {
4197 ; SI-LABEL: multiuse_fneg_2_vop3_users_f16:
4199 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4200 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
4201 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
4202 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
4203 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
4204 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4205 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
4206 ; SI-NEXT: v_fma_f32 v0, -v3, v1, v2
4207 ; SI-NEXT: v_fma_f32 v1, -v3, v2, 2.0
4208 ; SI-NEXT: s_setpc_b64 s[30:31]
4210 ; VI-LABEL: multiuse_fneg_2_vop3_users_f16:
4212 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4213 ; VI-NEXT: v_fma_f16 v3, -v0, v1, v2
4214 ; VI-NEXT: v_fma_f16 v1, -v0, v2, 2.0
4215 ; VI-NEXT: v_mov_b32_e32 v0, v3
4216 ; VI-NEXT: s_setpc_b64 s[30:31]
4218 ; GFX11-LABEL: multiuse_fneg_2_vop3_users_f16:
4220 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4221 ; GFX11-NEXT: v_fma_f16 v3, -v0, v1, v2
4222 ; GFX11-NEXT: v_fma_f16 v1, -v0, v2, 2.0
4223 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
4224 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
4225 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4226 %fneg.a = fneg half %a
4227 %fma0 = call half @llvm.fma.f16(half %fneg.a, half %b, half %c)
4228 %fma1 = call half @llvm.fma.f16(half %fneg.a, half %c, half 2.0)
4229 %insert.0 = insertvalue { half, half } poison, half %fma0, 0
4230 %insert.1 = insertvalue { half, half } %insert.0, half %fma1, 1
4231 ret { half, half } %insert.1
4234 ; There are multiple users, but both require using a larger encoding
; Both fmul users take %fneg.a. In the VI/GFX11 checks each multiply is
; emitted in its _e64 form with a -v0 source modifier; the SI checks instead
; negate once, on the f32-to-f16 conversion of the shared operand.
4236 define { half, half } @multiuse_fneg_2_vop2_users_f16(half %a, half %b, half %c) #0 {
4237 ; SI-LABEL: multiuse_fneg_2_vop2_users_f16:
4239 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4240 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
4241 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
4242 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
4243 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4244 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
4245 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
4246 ; SI-NEXT: v_mul_f32_e32 v0, v3, v1
4247 ; SI-NEXT: v_mul_f32_e32 v1, v3, v2
4248 ; SI-NEXT: s_setpc_b64 s[30:31]
4250 ; VI-LABEL: multiuse_fneg_2_vop2_users_f16:
4252 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4253 ; VI-NEXT: v_mul_f16_e64 v3, -v0, v1
4254 ; VI-NEXT: v_mul_f16_e64 v1, -v0, v2
4255 ; VI-NEXT: v_mov_b32_e32 v0, v3
4256 ; VI-NEXT: s_setpc_b64 s[30:31]
4258 ; GFX11-LABEL: multiuse_fneg_2_vop2_users_f16:
4260 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4261 ; GFX11-NEXT: v_mul_f16_e64 v3, -v0, v1
4262 ; GFX11-NEXT: v_mul_f16_e64 v1, -v0, v2
4263 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
4264 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
4265 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4266 %fneg.a = fneg half %a
4267 %mul0 = fmul half %fneg.a, %b
4268 %mul1 = fmul half %fneg.a, %c
4269 %insert.0 = insertvalue { half, half } poison, half %mul0, 0
4270 %insert.1 = insertvalue { half, half } %insert.0, half %mul1, 1
4271 ret { half, half } %insert.1
4274 ; One user is VOP3 so has no cost to folding the modifier, the other does.
; %fneg.a feeds one llvm.fma.f16 call and one fmul. In the VI/GFX11 checks the
; negation is a -v2 modifier on both the v_fma_f16 and the v_mul_f16_e64.
4275 define { half, half } @multiuse_fneg_vop2_vop3_users_f16(ptr addrspace(1) %out, half %a, half %b, half %c) #0 {
4276 ; SI-LABEL: multiuse_fneg_vop2_vop3_users_f16:
4278 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4279 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v3
4280 ; SI-NEXT: v_cvt_f16_f32_e64 v1, -v2
4281 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v4
4282 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4283 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4284 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
4285 ; SI-NEXT: v_fma_f32 v0, v1, v0, 2.0
4286 ; SI-NEXT: v_mul_f32_e32 v1, v1, v2
4287 ; SI-NEXT: s_setpc_b64 s[30:31]
4289 ; VI-LABEL: multiuse_fneg_vop2_vop3_users_f16:
4291 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4292 ; VI-NEXT: v_fma_f16 v0, -v2, v3, 2.0
4293 ; VI-NEXT: v_mul_f16_e64 v1, -v2, v4
4294 ; VI-NEXT: s_setpc_b64 s[30:31]
4296 ; GFX11-LABEL: multiuse_fneg_vop2_vop3_users_f16:
4298 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4299 ; GFX11-NEXT: v_fma_f16 v0, -v2, v3, 2.0
4300 ; GFX11-NEXT: v_mul_f16_e64 v1, -v2, v4
4301 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4302 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4303 %tid.ext = sext i32 %tid to i64
4304 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
4306 %fneg.a = fneg half %a
4307 %fma0 = call half @llvm.fma.f16(half %fneg.a, half %b, half 2.0)
4308 %mul1 = fmul half %fneg.a, %c
4310 %insert.0 = insertvalue { half, half } poison, half %fma0, 0
4311 %insert.1 = insertvalue { half, half } %insert.0, half %mul1, 1
4312 ret { half, half } %insert.1
4315 ; The use of the fneg requires a code size increase, but folding into
4316 ; the source does not
; The fneg of the fma result feeds two fmuls. The -SAFE check prefixes keep
; the fma un-negated and put a -v1 modifier on each multiply (_e64 form). The
; -NSZ prefixes instead negate the fma's operands (second multiplicand and the
; 2.0 addend) so both multiplies use the plain _e32 form.
4317 define { half, half } @free_fold_src_code_size_cost_use_f16(ptr addrspace(1) %out, half %a, half %b, half %c, half %d) #0 {
4318 ; SI-SAFE-LABEL: free_fold_src_code_size_cost_use_f16:
4320 ; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4321 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v3
4322 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v2
4323 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v4
4324 ; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v5
4325 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
4326 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
4327 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
4328 ; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3
4329 ; SI-SAFE-NEXT: v_fma_f32 v1, v1, v0, 2.0
4330 ; SI-SAFE-NEXT: v_mul_f32_e64 v0, -v1, v2
4331 ; SI-SAFE-NEXT: v_mul_f32_e64 v1, -v1, v3
4332 ; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
4334 ; SI-NSZ-LABEL: free_fold_src_code_size_cost_use_f16:
4336 ; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4337 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v3
4338 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v2
4339 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v4
4340 ; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v5
4341 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
4342 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
4343 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
4344 ; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3
4345 ; SI-NSZ-NEXT: v_fma_f32 v1, v1, -v0, -2.0
4346 ; SI-NSZ-NEXT: v_mul_f32_e32 v0, v1, v2
4347 ; SI-NSZ-NEXT: v_mul_f32_e32 v1, v1, v3
4348 ; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
4350 ; VI-SAFE-LABEL: free_fold_src_code_size_cost_use_f16:
4352 ; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4353 ; VI-SAFE-NEXT: v_fma_f16 v1, v2, v3, 2.0
4354 ; VI-SAFE-NEXT: v_mul_f16_e64 v0, -v1, v4
4355 ; VI-SAFE-NEXT: v_mul_f16_e64 v1, -v1, v5
4356 ; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
4358 ; VI-NSZ-LABEL: free_fold_src_code_size_cost_use_f16:
4360 ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4361 ; VI-NSZ-NEXT: v_fma_f16 v1, v2, -v3, -2.0
4362 ; VI-NSZ-NEXT: v_mul_f16_e32 v0, v1, v4
4363 ; VI-NSZ-NEXT: v_mul_f16_e32 v1, v1, v5
4364 ; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
4366 ; GFX11-SAFE-LABEL: free_fold_src_code_size_cost_use_f16:
4367 ; GFX11-SAFE: ; %bb.0:
4368 ; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4369 ; GFX11-SAFE-NEXT: v_fma_f16 v1, v2, v3, 2.0
4370 ; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
4371 ; GFX11-SAFE-NEXT: v_mul_f16_e64 v0, -v1, v4
4372 ; GFX11-SAFE-NEXT: v_mul_f16_e64 v1, -v1, v5
4373 ; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
4375 ; GFX11-NSZ-LABEL: free_fold_src_code_size_cost_use_f16:
4376 ; GFX11-NSZ: ; %bb.0:
4377 ; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4378 ; GFX11-NSZ-NEXT: v_fma_f16 v1, v2, -v3, -2.0
4379 ; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
4380 ; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, v1, v4
4381 ; GFX11-NSZ-NEXT: v_mul_f16_e32 v1, v1, v5
4382 ; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
4383 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4384 %tid.ext = sext i32 %tid to i64
4385 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
4387 %fma0 = call half @llvm.fma.f16(half %a, half %b, half 2.0)
4388 %fneg.fma0 = fneg half %fma0
4389 %mul1 = fmul half %fneg.fma0, %c
4390 %mul2 = fmul half %fneg.fma0, %d
4392 %insert.0 = insertvalue { half, half } poison, half %mul1, 0
4393 %insert.1 = insertvalue { half, half } %insert.0, half %mul2, 1
4394 ret { half, half } %insert.1
4397 ; %trunc.a has one fneg use, but it requires a code size increase and
4398 ; the fneg can instead be folded for free into the fma.
; In every check prefix the trunc is emitted without a modifier and the
; negation appears as a source modifier on the first operand of the v_fma.
4399 define half @one_use_cost_to_fold_into_src_f16(ptr addrspace(1) %out, half %a, half %b, half %c, half %d) #0 {
4400 ; SI-LABEL: one_use_cost_to_fold_into_src_f16:
4402 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4403 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
4404 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
4405 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v3
4406 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4407 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4408 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
4409 ; SI-NEXT: v_trunc_f32_e32 v1, v1
4410 ; SI-NEXT: v_fma_f32 v0, -v1, v2, v0
4411 ; SI-NEXT: s_setpc_b64 s[30:31]
4413 ; VI-LABEL: one_use_cost_to_fold_into_src_f16:
4415 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4416 ; VI-NEXT: v_trunc_f16_e32 v0, v2
4417 ; VI-NEXT: v_fma_f16 v0, -v0, v3, v4
4418 ; VI-NEXT: s_setpc_b64 s[30:31]
4420 ; GFX11-LABEL: one_use_cost_to_fold_into_src_f16:
4422 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4423 ; GFX11-NEXT: v_trunc_f16_e32 v0, v2
4424 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4425 ; GFX11-NEXT: v_fma_f16 v0, -v0, v3, v4
4426 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4427 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4428 %tid.ext = sext i32 %tid to i64
4429 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
4431 %trunc.a = call half @llvm.trunc.f16(half %a)
4432 %trunc.fneg.a = fneg half %trunc.a
4433 %fma0 = call half @llvm.fma.f16(half %trunc.fneg.a, half %b, half %c)
; %trunc.a is used both negated (through %trunc.fneg.a, by the fma) and
; un-negated (by the fmul with %d). In every check prefix the trunc result is
; left un-negated, the v_fma takes it with a -v1 modifier, and the multiply
; reads the same register without a modifier.
4437 define { half, half } @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, half %a, half %b, half %c, half %d) #0 {
4438 ; SI-LABEL: multi_use_cost_to_fold_into_src:
4440 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4441 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
4442 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
4443 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v3
4444 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v5
4445 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4446 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4447 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
4448 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
4449 ; SI-NEXT: v_trunc_f32_e32 v1, v1
4450 ; SI-NEXT: v_fma_f32 v0, -v1, v2, v0
4451 ; SI-NEXT: v_mul_f32_e32 v1, v1, v3
4452 ; SI-NEXT: s_setpc_b64 s[30:31]
4454 ; VI-LABEL: multi_use_cost_to_fold_into_src:
4456 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4457 ; VI-NEXT: v_trunc_f16_e32 v1, v2
4458 ; VI-NEXT: v_fma_f16 v0, -v1, v3, v4
4459 ; VI-NEXT: v_mul_f16_e32 v1, v1, v5
4460 ; VI-NEXT: s_setpc_b64 s[30:31]
4462 ; GFX11-LABEL: multi_use_cost_to_fold_into_src:
4464 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4465 ; GFX11-NEXT: v_trunc_f16_e32 v1, v2
4466 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4467 ; GFX11-NEXT: v_fma_f16 v0, -v1, v3, v4
4468 ; GFX11-NEXT: v_mul_f16_e32 v1, v1, v5
4469 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4470 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4471 %tid.ext = sext i32 %tid to i64
4472 %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
4473 %trunc.a = call half @llvm.trunc.f16(half %a)
4474 %trunc.fneg.a = fneg half %trunc.a
4475 %fma0 = call half @llvm.fma.f16(half %trunc.fneg.a, half %b, half %c)
4476 %mul1 = fmul half %trunc.a, %d
4477 %insert.0 = insertvalue { half, half } poison, half %fma0, 0
4478 %insert.1 = insertvalue { half, half } %insert.0, half %mul1, 1
4479 ret { half, half } %insert.1
4482 ; The AMDGPU combine to pull fneg into the FMA operands was being
4483 ; undone by the generic combine to pull the fneg out of the fma if
4484 ; !isFNegFree. We were reporting false for v2f32 even though it will
4485 ; be split into f32 where it will be free.
; NOTE(review): this function operates on <2 x half>, not <2 x float>; the
; "v2f32"/"f32" wording above may have been carried over from an f32 variant
; of the test. Confirm before relying on it.
; All operations in the body carry the `fast` flag except the fneg itself.
4486 define <2 x half> @fneg_fma_fneg_dagcombine_loop(<2 x half> %arg, <2 x half> %arg1, <2 x half> %arg2) #0 {
4487 ; SI-LABEL: fneg_fma_fneg_dagcombine_loop:
4488 ; SI: ; %bb.0: ; %bb
4489 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4490 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
4491 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
4492 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
4493 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
4494 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5
4495 ; SI-NEXT: v_or_b32_e32 v6, v4, v6
4496 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
4497 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
4498 ; SI-NEXT: v_xor_b32_e32 v6, 0x80008000, v6
4499 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
4500 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
4501 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
4502 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
4503 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
4504 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4505 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4506 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
4507 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
4508 ; SI-NEXT: s_brev_b32 s4, 1
4509 ; SI-NEXT: v_fma_f32 v3, v3, v7, s4
4510 ; SI-NEXT: v_fma_f32 v2, v2, v6, s4
4511 ; SI-NEXT: v_sub_f32_e32 v1, v3, v1
4512 ; SI-NEXT: v_sub_f32_e32 v0, v2, v0
4513 ; SI-NEXT: v_mul_f32_e32 v0, v0, v4
4514 ; SI-NEXT: v_mul_f32_e32 v1, v1, v5
4515 ; SI-NEXT: s_setpc_b64 s[30:31]
4517 ; VI-LABEL: fneg_fma_fneg_dagcombine_loop:
4518 ; VI: ; %bb.0: ; %bb
4519 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4520 ; VI-NEXT: s_mov_b32 s4, 0x8000
4521 ; VI-NEXT: v_fma_f16 v3, v1, -v2, s4
4522 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
4523 ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
4524 ; VI-NEXT: v_fma_f16 v1, v1, -v4, s4
4525 ; VI-NEXT: v_sub_f16_e32 v3, v3, v0
4526 ; VI-NEXT: v_sub_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4527 ; VI-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4528 ; VI-NEXT: v_mul_f16_e32 v1, v3, v2
4529 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
4530 ; VI-NEXT: s_setpc_b64 s[30:31]
4532 ; GFX11-LABEL: fneg_fma_fneg_dagcombine_loop:
4533 ; GFX11: ; %bb.0: ; %bb
4534 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4535 ; GFX11-NEXT: v_pk_fma_f16 v1, v1, v2, 0x8000 op_sel_hi:[1,1,0] neg_lo:[0,1,0] neg_hi:[0,1,0]
4536 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4537 ; GFX11-NEXT: v_pk_add_f16 v0, v1, v0 neg_lo:[0,1] neg_hi:[0,1]
4538 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v2
4539 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4541 %i3 = call fast <2 x half> @llvm.fma.v2f16(<2 x half> %arg1, <2 x half> %arg2, <2 x half> zeroinitializer)
4542 %i4 = fadd fast <2 x half> %i3, %arg
4543 %i5 = fneg <2 x half> %i4
4544 %i6 = fmul fast <2 x half> %i5, %arg2
4548 ; This expects denormal flushing, so can't turn this fmul into fneg
4549 ; TODO: Keeping this as fmul saves encoding size
; NOTE(review): the checks below show a single multiply with a negated first
; operand (a -v0 modifier on VI/GFX11, a negating conversion on the SI path),
; i.e. the multiply by -1.0 does appear to have been turned into a negation.
; Attribute group #0 only sets "denormal-fp-math-f32", so the statement above
; may apply to an f32 variant rather than to this half test. Confirm.
; Here the nnan flag is on the second fmul, not on the multiply by -1.0.
4550 define half @nnan_fmul_neg1_to_fneg(half %x, half %y) #0 {
4551 ; SI-LABEL: nnan_fmul_neg1_to_fneg:
4553 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4554 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
4555 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
4556 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4557 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4558 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
4559 ; SI-NEXT: s_setpc_b64 s[30:31]
4561 ; VI-LABEL: nnan_fmul_neg1_to_fneg:
4563 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4564 ; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
4565 ; VI-NEXT: s_setpc_b64 s[30:31]
4567 ; GFX11-LABEL: nnan_fmul_neg1_to_fneg:
4569 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4570 ; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
4571 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4572 %mul = fmul half %x, -1.0
4573 %add = fmul nnan half %mul, %y
4577 ; It's legal to turn this fmul into an fneg since denormals are
4578 ; preserved and we know an snan can't happen from the flag.
; The flag in question is the nnan on the multiply by -1.0. This function has
; no attribute group attached. The checks show one multiply whose first
; operand is negated and no separate multiply by -1.0.
4579 define half @denormal_fmul_neg1_to_fneg(half %x, half %y) {
4580 ; SI-LABEL: denormal_fmul_neg1_to_fneg:
4582 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4583 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
4584 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
4585 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4586 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4587 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
4588 ; SI-NEXT: s_setpc_b64 s[30:31]
4590 ; VI-LABEL: denormal_fmul_neg1_to_fneg:
4592 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4593 ; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
4594 ; VI-NEXT: s_setpc_b64 s[30:31]
4596 ; GFX11-LABEL: denormal_fmul_neg1_to_fneg:
4598 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4599 ; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
4600 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4601 %mul = fmul nnan half %x, -1.0
4602 %add = fmul half %mul, %y
; NOTE(review): the sentence below reads as a fragment; presumably it continues
; the reasoning that the multiply by -1.0 may be turned into an fneg. Confirm
; and restore the missing lead-in.
4606 ; know the source can't be an snan
; The operand of the multiply by -1.0 is %canonical, itself the result of an
; fmul. In the VI/GFX11 checks the negation shows up as a -v0 modifier on that
; first multiply, followed by a plain multiply by the second argument.
4607 define half @denorm_snan_fmul_neg1_to_fneg(half %x, half %y) {
4608 ; SI-LABEL: denorm_snan_fmul_neg1_to_fneg:
4610 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4611 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
4612 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
4613 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
4614 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
4615 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4616 ; SI-NEXT: v_mul_f32_e32 v0, v2, v0
4617 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
4618 ; SI-NEXT: s_setpc_b64 s[30:31]
4620 ; VI-LABEL: denorm_snan_fmul_neg1_to_fneg:
4622 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4623 ; VI-NEXT: v_mul_f16_e64 v0, v0, -v0
4624 ; VI-NEXT: v_mul_f16_e32 v0, v0, v1
4625 ; VI-NEXT: s_setpc_b64 s[30:31]
4627 ; GFX11-LABEL: denorm_snan_fmul_neg1_to_fneg:
4629 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4630 ; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v0
4631 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4632 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
4633 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4634 %canonical = fmul half %x, %x
4635 %mul = fmul half %canonical, -1.0
4636 %add = fmul half %mul, %y
; The operand of the multiply by -1.0 is the result of llvm.canonicalize.f16.
; In the VI/GFX11 checks this becomes v_max_f16 with both inputs negated
; (-v0, -v0), followed by a plain multiply by the second argument.
4640 define half @flush_snan_fmul_neg1_to_fneg(half %x, half %y) #0 {
4641 ; SI-LABEL: flush_snan_fmul_neg1_to_fneg:
4643 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4644 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
4645 ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
4646 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4647 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4648 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1
4649 ; SI-NEXT: s_setpc_b64 s[30:31]
4651 ; VI-LABEL: flush_snan_fmul_neg1_to_fneg:
4653 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4654 ; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
4655 ; VI-NEXT: v_mul_f16_e32 v0, v0, v1
4656 ; VI-NEXT: s_setpc_b64 s[30:31]
4658 ; GFX11-LABEL: flush_snan_fmul_neg1_to_fneg:
4660 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4661 ; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
4662 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4663 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
4664 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4665 %quiet = call half @llvm.canonicalize.f16(half %x)
4666 %mul = fmul half %quiet, -1.0
4667 %add = fmul half %mul, %y
; Selects between two negated values and adds %z. In every check prefix the
; select (v_cndmask) operates on the un-negated inputs and the add is emitted
; as a subtract of the selected value from the third operand.
4671 define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
4672 ; SI-LABEL: fadd_select_fneg_fneg_f16:
4674 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4675 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
4676 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
4677 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
4678 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
4679 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4680 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
4681 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
4682 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
4683 ; SI-NEXT: v_sub_f32_e32 v0, v3, v0
4684 ; SI-NEXT: s_setpc_b64 s[30:31]
4686 ; VI-LABEL: fadd_select_fneg_fneg_f16:
4688 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4689 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
4690 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
4691 ; VI-NEXT: v_sub_f16_e32 v0, v3, v0
4692 ; VI-NEXT: s_setpc_b64 s[30:31]
4694 ; GFX11-LABEL: fadd_select_fneg_fneg_f16:
4696 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4697 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
4698 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
4699 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4700 ; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0
4701 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4702 %cmp = icmp eq i32 %arg0, 0
4703 %neg.x = fneg half %x
4704 %neg.y = fneg half %y
4705 %select = select i1 %cmp, half %neg.x, half %neg.y
4706 %add = fadd half %select, %z
4710 ; FIXME: Terrible code for SI
; <2 x half> version of the select-of-two-fnegs plus fadd pattern. In the
; GFX11 checks the select works on the un-negated packed values and the add is
; a single v_pk_add_f16 with neg_lo/neg_hi set on the selected operand; the VI
; checks use two subtracts and an OR to rebuild the packed result.
4711 define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
4712 ; SI-LABEL: fadd_select_fneg_fneg_v2f16:
4714 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4715 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
4716 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
4717 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
4718 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
4719 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
4720 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
4721 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v4
4722 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v6
4723 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
4724 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
4725 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
4726 ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
4727 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
4728 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v4
4729 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v5
4730 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4731 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4732 ; SI-NEXT: v_sub_f32_e32 v0, v4, v0
4733 ; SI-NEXT: v_sub_f32_e32 v1, v3, v1
4734 ; SI-NEXT: s_setpc_b64 s[30:31]
4736 ; VI-LABEL: fadd_select_fneg_fneg_v2f16:
4738 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4739 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
4740 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
4741 ; VI-NEXT: v_sub_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4742 ; VI-NEXT: v_sub_f16_e32 v0, v3, v0
4743 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
4744 ; VI-NEXT: s_setpc_b64 s[30:31]
4746 ; GFX11-LABEL: fadd_select_fneg_fneg_v2f16:
4748 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4749 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
4750 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
4751 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4752 ; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
4753 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4754 %cmp = icmp eq i32 %arg0, 0
4755 %neg.x = fneg <2 x half> %x
4756 %neg.y = fneg <2 x half> %y
4757 %select = select i1 %cmp, <2 x half> %neg.x, <2 x half> %neg.y
4758 %add = fadd <2 x half> %select, %z
; Intrinsic declarations. All carry attribute group #1 except the
; llvm.fma.v2f16 declaration, which has no attribute group attached.
4762 declare i32 @llvm.amdgcn.workitem.id.x() #1
4763 declare half @llvm.sin.f16(half) #1
4764 declare half @llvm.trunc.f16(half) #1
4765 declare half @llvm.round.f16(half) #1
4766 declare half @llvm.rint.f16(half) #1
4767 declare half @llvm.nearbyint.f16(half) #1
4768 declare half @llvm.roundeven.f16(half) #1
4769 declare half @llvm.canonicalize.f16(half) #1
4770 declare half @llvm.minnum.f16(half, half) #1
4771 declare half @llvm.maxnum.f16(half, half) #1
4772 declare half @llvm.fma.f16(half, half, half) #1
4773 declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>)
4774 declare half @llvm.fmuladd.f16(half, half, half) #1
4775 declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>) #1
; Attribute groups. #0 is the group attached to most test functions in this
; part of the file; note that it configures the f32 denormal mode only
; ("denormal-fp-math-f32"), and #4 additionally sets "amdgpu-ieee"="false".
4777 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
4778 attributes #1 = { nounwind readnone }
4779 attributes #2 = { nounwind "unsafe-fp-math"="true" }
4780 attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
4781 attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }