1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=CIVI,CI %s
3 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=CIVI,VI %s
4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; Computes y + (-|x|) on scalar half kernel arguments; the negation is written
; as fsub -0.0, |x|.
; The checks expect the negated fabs to be absorbed into a subtract that takes
; |x| as a source modifier (v_sub_f32 on the converted values for kaveri,
; v_sub_f16 for the other run lines).
7 define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) {
8 ; CI-LABEL: fneg_fabs_fadd_f16:
10 ; CI-NEXT: s_load_dword s0, s[4:5], 0x2
11 ; CI-NEXT: s_waitcnt lgkmcnt(0)
12 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
13 ; CI-NEXT: s_lshr_b32 s0, s0, 16
14 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
15 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
16 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0
17 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0
18 ; CI-NEXT: s_waitcnt lgkmcnt(0)
19 ; CI-NEXT: v_mov_b32_e32 v0, s0
20 ; CI-NEXT: v_mov_b32_e32 v1, s1
21 ; CI-NEXT: flat_store_short v[0:1], v2
24 ; VI-LABEL: fneg_fabs_fadd_f16:
26 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
27 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
28 ; VI-NEXT: s_waitcnt lgkmcnt(0)
29 ; VI-NEXT: s_lshr_b32 s3, s2, 16
30 ; VI-NEXT: v_mov_b32_e32 v0, s2
31 ; VI-NEXT: v_sub_f16_e64 v2, s3, |v0|
32 ; VI-NEXT: v_mov_b32_e32 v0, s0
33 ; VI-NEXT: v_mov_b32_e32 v1, s1
34 ; VI-NEXT: flat_store_short v[0:1], v2
37 ; GFX9-LABEL: fneg_fabs_fadd_f16:
39 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
40 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
41 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
43 ; GFX9-NEXT: s_lshr_b32 s3, s2, 16
44 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
45 ; GFX9-NEXT: v_sub_f16_e64 v1, s3, |v1|
46 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
49 ; GFX11-LABEL: fneg_fabs_fadd_f16:
51 ; GFX11-NEXT: s_clause 0x1
52 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
53 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
54 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
55 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
56 ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
57 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
58 ; GFX11-NEXT: v_sub_f16_e64 v1, s3, |s2|
59 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
61 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
62 ; GFX11-NEXT: s_endpgm
63 %fabs = call half @llvm.fabs.f16(half %x)
64 %fsub = fsub half -0.0, %fabs
65 %fadd = fadd half %y, %fsub
66 store half %fadd, ptr addrspace(1) %out, align 2
; Computes y * (-|x|) on scalar half kernel arguments; the negation is written
; as fsub -0.0, |x|.
; The checks expect a -|x| source modifier instead of separate fabs/fneg
; instructions: on the f16-to-f32 conversion for kaveri (after an s_and_b32
; with 0x7fff), and directly on v_mul_f16 for the other run lines.
70 define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) {
71 ; CI-LABEL: fneg_fabs_fmul_f16:
73 ; CI-NEXT: s_load_dword s0, s[4:5], 0x2
74 ; CI-NEXT: s_waitcnt lgkmcnt(0)
75 ; CI-NEXT: s_and_b32 s1, s0, 0x7fff
76 ; CI-NEXT: s_lshr_b32 s0, s0, 16
77 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
78 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s1|
79 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
80 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1
81 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0
82 ; CI-NEXT: s_waitcnt lgkmcnt(0)
83 ; CI-NEXT: v_mov_b32_e32 v0, s0
84 ; CI-NEXT: v_mov_b32_e32 v1, s1
85 ; CI-NEXT: flat_store_short v[0:1], v2
88 ; VI-LABEL: fneg_fabs_fmul_f16:
90 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
91 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
92 ; VI-NEXT: s_waitcnt lgkmcnt(0)
93 ; VI-NEXT: s_lshr_b32 s3, s2, 16
94 ; VI-NEXT: v_mov_b32_e32 v0, s2
95 ; VI-NEXT: v_mul_f16_e64 v2, s3, -|v0|
96 ; VI-NEXT: v_mov_b32_e32 v0, s0
97 ; VI-NEXT: v_mov_b32_e32 v1, s1
98 ; VI-NEXT: flat_store_short v[0:1], v2
101 ; GFX9-LABEL: fneg_fabs_fmul_f16:
103 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
104 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
105 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
106 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
107 ; GFX9-NEXT: s_lshr_b32 s3, s2, 16
108 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
109 ; GFX9-NEXT: v_mul_f16_e64 v1, s3, -|v1|
110 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
111 ; GFX9-NEXT: s_endpgm
113 ; GFX11-LABEL: fneg_fabs_fmul_f16:
115 ; GFX11-NEXT: s_clause 0x1
116 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
117 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
118 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
119 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
120 ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
121 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
122 ; GFX11-NEXT: v_mul_f16_e64 v1, s3, -|s2|
123 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
124 ; GFX11-NEXT: s_nop 0
125 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
126 ; GFX11-NEXT: s_endpgm
127 %fabs = call half @llvm.fabs.f16(half %x)
128 %fsub = fsub half -0.0, %fabs
129 %fmul = fmul half %y, %fsub
130 store half %fmul, ptr addrspace(1) %out, align 2
134 ; DAGCombiner will transform:
135 ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
136 ; unless isFabsFree returns true
; Stores -|bitcast(in)| where the half value comes from an i16 kernel argument
; through a bitcast.
; Every run line expects the whole fneg(fabs()) to become a single
; s_bitset1_b32 of bit 15 (the top bit of the 16-bit value) on the loaded
; argument, with no floating-point instruction.
137 define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
138 ; CI-LABEL: fneg_fabs_free_f16:
140 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2
141 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
142 ; CI-NEXT: s_waitcnt lgkmcnt(0)
143 ; CI-NEXT: s_bitset1_b32 s2, 15
144 ; CI-NEXT: v_mov_b32_e32 v0, s0
145 ; CI-NEXT: v_mov_b32_e32 v1, s1
146 ; CI-NEXT: v_mov_b32_e32 v2, s2
147 ; CI-NEXT: flat_store_short v[0:1], v2
150 ; VI-LABEL: fneg_fabs_free_f16:
152 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
153 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
154 ; VI-NEXT: s_waitcnt lgkmcnt(0)
155 ; VI-NEXT: s_bitset1_b32 s2, 15
156 ; VI-NEXT: v_mov_b32_e32 v0, s0
157 ; VI-NEXT: v_mov_b32_e32 v1, s1
158 ; VI-NEXT: v_mov_b32_e32 v2, s2
159 ; VI-NEXT: flat_store_short v[0:1], v2
162 ; GFX9-LABEL: fneg_fabs_free_f16:
164 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
165 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
166 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
167 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
168 ; GFX9-NEXT: s_bitset1_b32 s2, 15
169 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
170 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
171 ; GFX9-NEXT: s_endpgm
173 ; GFX11-LABEL: fneg_fabs_free_f16:
175 ; GFX11-NEXT: s_clause 0x1
176 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
177 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
178 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
179 ; GFX11-NEXT: s_bitset1_b32 s2, 15
180 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
181 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
182 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
183 ; GFX11-NEXT: s_nop 0
184 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
185 ; GFX11-NEXT: s_endpgm
186 %bc = bitcast i16 %in to half
187 %fabs = call half @llvm.fabs.f16(half %bc)
188 %fsub = fsub half -0.0, %fabs
189 store half %fsub, ptr addrspace(1) %out
; Stores -|in| for a scalar half kernel argument that has no other user.
; Every run line expects the fneg(fabs()) pair to become a single
; s_bitset1_b32 of bit 15 on the loaded argument before the 16-bit store.
193 define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
194 ; CI-LABEL: fneg_fabs_f16:
196 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2
197 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
198 ; CI-NEXT: s_waitcnt lgkmcnt(0)
199 ; CI-NEXT: s_bitset1_b32 s2, 15
200 ; CI-NEXT: v_mov_b32_e32 v0, s0
201 ; CI-NEXT: v_mov_b32_e32 v1, s1
202 ; CI-NEXT: v_mov_b32_e32 v2, s2
203 ; CI-NEXT: flat_store_short v[0:1], v2
206 ; VI-LABEL: fneg_fabs_f16:
208 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
209 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
210 ; VI-NEXT: s_waitcnt lgkmcnt(0)
211 ; VI-NEXT: s_bitset1_b32 s2, 15
212 ; VI-NEXT: v_mov_b32_e32 v0, s0
213 ; VI-NEXT: v_mov_b32_e32 v1, s1
214 ; VI-NEXT: v_mov_b32_e32 v2, s2
215 ; VI-NEXT: flat_store_short v[0:1], v2
218 ; GFX9-LABEL: fneg_fabs_f16:
220 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
221 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
222 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
223 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
224 ; GFX9-NEXT: s_bitset1_b32 s2, 15
225 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
226 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
227 ; GFX9-NEXT: s_endpgm
229 ; GFX11-LABEL: fneg_fabs_f16:
231 ; GFX11-NEXT: s_clause 0x1
232 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
233 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
234 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
235 ; GFX11-NEXT: s_bitset1_b32 s2, 15
236 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
237 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
238 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
239 ; GFX11-NEXT: s_nop 0
240 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
241 ; GFX11-NEXT: s_endpgm
242 %fabs = call half @llvm.fabs.f16(half %in)
243 %fsub = fsub half -0.0, %fabs
244 store half %fsub, ptr addrspace(1) %out, align 2
; Same -|x| pattern as the scalar-argument tests, but the half value is loaded
; from global memory, so it is held in a vector register.
; Every run line expects a single v_or_b32 with 0x8000 on the loaded 16-bit
; value, followed by the 16-bit store.
248 define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
249 ; CIVI-LABEL: v_fneg_fabs_f16:
251 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
252 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
253 ; CIVI-NEXT: v_mov_b32_e32 v0, s2
254 ; CIVI-NEXT: v_mov_b32_e32 v1, s3
255 ; CIVI-NEXT: flat_load_ushort v2, v[0:1]
256 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
257 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
258 ; CIVI-NEXT: s_waitcnt vmcnt(0)
259 ; CIVI-NEXT: v_or_b32_e32 v2, 0x8000, v2
260 ; CIVI-NEXT: flat_store_short v[0:1], v2
261 ; CIVI-NEXT: s_endpgm
263 ; GFX9-LABEL: v_fneg_fabs_f16:
265 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
266 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
267 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
268 ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
269 ; GFX9-NEXT: s_waitcnt vmcnt(0)
270 ; GFX9-NEXT: v_or_b32_e32 v1, 0x8000, v1
271 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
272 ; GFX9-NEXT: s_endpgm
274 ; GFX11-LABEL: v_fneg_fabs_f16:
276 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
277 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
278 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
279 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
280 ; GFX11-NEXT: s_waitcnt vmcnt(0)
281 ; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1
282 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
283 ; GFX11-NEXT: s_nop 0
284 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
285 ; GFX11-NEXT: s_endpgm
286 %val = load half, ptr addrspace(1) %in, align 2
287 %fabs = call half @llvm.fabs.f16(half %val)
288 %fsub = fsub half -0.0, %fabs
289 store half %fsub, ptr addrspace(1) %out, align 2
; -|in + <1.0, 2.0>| on a <2 x half> kernel argument: the fabs operand is the
; result of a floating-point add rather than a value taken directly from the
; argument.
; Every run line expects the add to be performed first, then a single
; v_or_b32 with 0x80008000 covering both 16-bit lanes of the packed result.
293 define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) {
294 ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src:
296 ; CI-NEXT: s_load_dword s0, s[4:5], 0x2
297 ; CI-NEXT: s_waitcnt lgkmcnt(0)
298 ; CI-NEXT: s_lshr_b32 s1, s0, 16
299 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
300 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
301 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
302 ; CI-NEXT: v_add_f32_e32 v1, 2.0, v1
303 ; CI-NEXT: v_add_f32_e32 v0, 1.0, v0
304 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
305 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
306 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
307 ; CI-NEXT: v_or_b32_e32 v0, v0, v1
308 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
309 ; CI-NEXT: s_waitcnt lgkmcnt(0)
310 ; CI-NEXT: v_mov_b32_e32 v0, s0
311 ; CI-NEXT: v_mov_b32_e32 v1, s1
312 ; CI-NEXT: flat_store_dword v[0:1], v2
315 ; VI-LABEL: s_fneg_fabs_v2f16_non_bc_src:
317 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
318 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
319 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000
320 ; VI-NEXT: s_waitcnt lgkmcnt(0)
321 ; VI-NEXT: s_lshr_b32 s3, s2, 16
322 ; VI-NEXT: v_mov_b32_e32 v2, s3
323 ; VI-NEXT: v_add_f16_e64 v1, s2, 1.0
324 ; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
325 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
326 ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
327 ; VI-NEXT: v_mov_b32_e32 v0, s0
328 ; VI-NEXT: v_mov_b32_e32 v1, s1
329 ; VI-NEXT: flat_store_dword v[0:1], v2
332 ; GFX9-LABEL: s_fneg_fabs_v2f16_non_bc_src:
334 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
335 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
336 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003c00
337 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
338 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
339 ; GFX9-NEXT: v_pk_add_f16 v1, s2, v1
340 ; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v1
341 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
342 ; GFX9-NEXT: s_endpgm
344 ; GFX11-LABEL: s_fneg_fabs_v2f16_non_bc_src:
346 ; GFX11-NEXT: s_clause 0x1
347 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
348 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
349 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
350 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
351 ; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2
352 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
353 ; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0
354 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
355 ; GFX11-NEXT: s_nop 0
356 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
357 ; GFX11-NEXT: s_endpgm
358 %add = fadd <2 x half> %in, <half 1.0, half 2.0>
359 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %add)
360 %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
361 store <2 x half> %fneg.fabs, ptr addrspace(1) %out
365 ; FIXME: single bit op
366 ; Combine turns this into integer op when bitcast source (from load)
; -|in| applied directly to a <2 x half> kernel argument.
; Every run line expects a single scalar s_or_b32 with 0x80008000 on the
; loaded 32-bit value, followed by a 32-bit store.
367 define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) {
368 ; CI-LABEL: s_fneg_fabs_v2f16_bc_src:
370 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2
371 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
372 ; CI-NEXT: s_waitcnt lgkmcnt(0)
373 ; CI-NEXT: s_or_b32 s2, s2, 0x80008000
374 ; CI-NEXT: v_mov_b32_e32 v0, s0
375 ; CI-NEXT: v_mov_b32_e32 v1, s1
376 ; CI-NEXT: v_mov_b32_e32 v2, s2
377 ; CI-NEXT: flat_store_dword v[0:1], v2
380 ; VI-LABEL: s_fneg_fabs_v2f16_bc_src:
382 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
383 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
384 ; VI-NEXT: s_waitcnt lgkmcnt(0)
385 ; VI-NEXT: s_or_b32 s2, s2, 0x80008000
386 ; VI-NEXT: v_mov_b32_e32 v0, s0
387 ; VI-NEXT: v_mov_b32_e32 v1, s1
388 ; VI-NEXT: v_mov_b32_e32 v2, s2
389 ; VI-NEXT: flat_store_dword v[0:1], v2
392 ; GFX9-LABEL: s_fneg_fabs_v2f16_bc_src:
394 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
395 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
396 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
397 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
398 ; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000
399 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
400 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
401 ; GFX9-NEXT: s_endpgm
403 ; GFX11-LABEL: s_fneg_fabs_v2f16_bc_src:
405 ; GFX11-NEXT: s_clause 0x1
406 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
407 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
408 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
409 ; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000
410 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
411 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
412 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
413 ; GFX11-NEXT: s_nop 0
414 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
415 ; GFX11-NEXT: s_endpgm
416 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
417 %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
418 store <2 x half> %fneg.fabs, ptr addrspace(1) %out
; -|in| applied to a <4 x half> kernel argument.
; Every run line expects two scalar s_or_b32 instructions with 0x80008000,
; one per 32-bit register of the loaded value, followed by a 64-bit store.
422 define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
423 ; CIVI-LABEL: fneg_fabs_v4f16:
425 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
426 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
427 ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000
428 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000
429 ; CIVI-NEXT: v_mov_b32_e32 v3, s1
430 ; CIVI-NEXT: v_mov_b32_e32 v0, s2
431 ; CIVI-NEXT: v_mov_b32_e32 v1, s3
432 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
433 ; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
434 ; CIVI-NEXT: s_endpgm
436 ; GFX9-LABEL: fneg_fabs_v4f16:
438 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
439 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
440 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
441 ; GFX9-NEXT: s_or_b32 s3, s3, 0x80008000
442 ; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000
443 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
444 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
445 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
446 ; GFX9-NEXT: s_endpgm
448 ; GFX11-LABEL: fneg_fabs_v4f16:
450 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
451 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
452 ; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000
453 ; GFX11-NEXT: s_or_b32 s3, s3, 0x80008000
454 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
455 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
456 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
457 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
458 ; GFX11-NEXT: s_nop 0
459 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
460 ; GFX11-NEXT: s_endpgm
461 %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
462 %fsub = fsub <4 x half> <half -0.0, half -0.0, half -0.0, half -0.0>, %fabs
463 store <4 x half> %fsub, ptr addrspace(1) %out
; (-|in|) * <4.0, 4.0> on a <2 x half> kernel argument: the negated fabs has a
; single arithmetic user.
; The checks expect the negation to move into the multiplier constant (-4.0,
; or 0xc400 which is the half encoding of -4.0), leaving only the fabs on the
; input: as a |src| modifier for kaveri and tonga, and as an s_and_b32 with
; 0x7fff7fff ahead of v_pk_mul_f16 for gfx900 and gfx1100.
467 define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 {
468 ; CI-LABEL: fold_user_fneg_fabs_v2f16:
470 ; CI-NEXT: s_load_dword s0, s[4:5], 0x2
471 ; CI-NEXT: s_waitcnt lgkmcnt(0)
472 ; CI-NEXT: s_lshr_b32 s1, s0, 16
473 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
474 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
475 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
476 ; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1
477 ; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0
478 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
479 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
480 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
481 ; CI-NEXT: v_or_b32_e32 v2, v0, v1
482 ; CI-NEXT: s_waitcnt lgkmcnt(0)
483 ; CI-NEXT: v_mov_b32_e32 v0, s0
484 ; CI-NEXT: v_mov_b32_e32 v1, s1
485 ; CI-NEXT: flat_store_dword v[0:1], v2
488 ; VI-LABEL: fold_user_fneg_fabs_v2f16:
490 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
491 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
492 ; VI-NEXT: v_mov_b32_e32 v0, 0xc400
493 ; VI-NEXT: s_waitcnt lgkmcnt(0)
494 ; VI-NEXT: s_lshr_b32 s3, s2, 16
495 ; VI-NEXT: v_mov_b32_e32 v2, s3
496 ; VI-NEXT: v_mul_f16_e64 v1, |s2|, -4.0
497 ; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
498 ; VI-NEXT: v_or_b32_e32 v2, v1, v0
499 ; VI-NEXT: v_mov_b32_e32 v0, s0
500 ; VI-NEXT: v_mov_b32_e32 v1, s1
501 ; VI-NEXT: flat_store_dword v[0:1], v2
504 ; GFX9-LABEL: fold_user_fneg_fabs_v2f16:
506 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
507 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
508 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
509 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
510 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
511 ; GFX9-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0]
512 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
513 ; GFX9-NEXT: s_endpgm
515 ; GFX11-LABEL: fold_user_fneg_fabs_v2f16:
517 ; GFX11-NEXT: s_clause 0x1
518 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
519 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
520 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
521 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
522 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
523 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
524 ; GFX11-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0]
525 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
526 ; GFX11-NEXT: s_nop 0
527 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
528 ; GFX11-NEXT: s_endpgm
529 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
530 %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
531 %mul = fmul <2 x half> %fneg.fabs, <half 4.0, half 4.0>
532 store <2 x half> %mul, ptr addrspace(1) %out
; The fabs result has two users: it is stored to %out0 as-is and its negation
; is stored to %out1.
; The checks expect the fabs to be computed once with s_and_b32 0x7fff7fff,
; and the negated copy to be derived from that result with one more scalar
; bit operation using 0x80008000 (s_or_b32 for kaveri, s_xor_b32 for the
; other run lines).
536 define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) {
537 ; CI-LABEL: s_fneg_multi_use_fabs_v2f16:
539 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
540 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
541 ; CI-NEXT: s_waitcnt lgkmcnt(0)
542 ; CI-NEXT: v_mov_b32_e32 v0, s0
543 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
544 ; CI-NEXT: v_mov_b32_e32 v1, s1
545 ; CI-NEXT: s_or_b32 s1, s0, 0x80008000
546 ; CI-NEXT: v_mov_b32_e32 v4, s0
547 ; CI-NEXT: v_mov_b32_e32 v2, s2
548 ; CI-NEXT: v_mov_b32_e32 v3, s3
549 ; CI-NEXT: flat_store_dword v[0:1], v4
550 ; CI-NEXT: v_mov_b32_e32 v0, s1
551 ; CI-NEXT: flat_store_dword v[2:3], v0
554 ; VI-LABEL: s_fneg_multi_use_fabs_v2f16:
556 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
557 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
558 ; VI-NEXT: s_waitcnt lgkmcnt(0)
559 ; VI-NEXT: v_mov_b32_e32 v0, s0
560 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
561 ; VI-NEXT: v_mov_b32_e32 v1, s1
562 ; VI-NEXT: s_xor_b32 s1, s0, 0x80008000
563 ; VI-NEXT: v_mov_b32_e32 v4, s0
564 ; VI-NEXT: v_mov_b32_e32 v2, s2
565 ; VI-NEXT: v_mov_b32_e32 v3, s3
566 ; VI-NEXT: flat_store_dword v[0:1], v4
567 ; VI-NEXT: v_mov_b32_e32 v0, s1
568 ; VI-NEXT: flat_store_dword v[2:3], v0
571 ; GFX9-LABEL: s_fneg_multi_use_fabs_v2f16:
573 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
574 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
575 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
576 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
577 ; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff
578 ; GFX9-NEXT: s_xor_b32 s5, s4, 0x80008000
579 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
580 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
581 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
582 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
583 ; GFX9-NEXT: s_endpgm
585 ; GFX11-LABEL: s_fneg_multi_use_fabs_v2f16:
587 ; GFX11-NEXT: s_clause 0x1
588 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10
589 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
590 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
591 ; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff
592 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
593 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
594 ; GFX11-NEXT: s_xor_b32 s5, s4, 0x80008000
595 ; GFX11-NEXT: v_mov_b32_e32 v2, s5
596 ; GFX11-NEXT: s_clause 0x1
597 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
598 ; GFX11-NEXT: global_store_b32 v0, v2, s[2:3]
599 ; GFX11-NEXT: s_nop 0
600 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
601 ; GFX11-NEXT: s_endpgm
602 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
603 %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
604 store <2 x half> %fabs, ptr addrspace(1) %out0
605 store <2 x half> %fneg, ptr addrspace(1) %out1
; The fabs result has two users: it is stored to %out0 as-is, and its negation
; feeds a multiply by <4.0, 4.0> whose result is stored to %out1.
; The checks expect the stored fabs to come from s_and_b32 0x7fff7fff, and the
; negation to be absorbed by the multiply as a -4.0 constant (0xc400 in the
; tonga sdwa form) instead of a separate sign-bit operation.
609 define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) {
610 ; CI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16:
612 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
613 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
614 ; CI-NEXT: s_waitcnt lgkmcnt(0)
615 ; CI-NEXT: v_mov_b32_e32 v0, s0
616 ; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010
617 ; CI-NEXT: v_cvt_f32_f16_e32 v5, s0
618 ; CI-NEXT: v_cvt_f32_f16_e64 v4, |s4|
619 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
620 ; CI-NEXT: v_mov_b32_e32 v1, s1
621 ; CI-NEXT: v_mul_f32_e32 v5, -4.0, v5
622 ; CI-NEXT: v_mul_f32_e32 v4, -4.0, v4
623 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
624 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
625 ; CI-NEXT: v_mov_b32_e32 v6, s0
626 ; CI-NEXT: v_mov_b32_e32 v2, s2
627 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
628 ; CI-NEXT: v_mov_b32_e32 v3, s3
629 ; CI-NEXT: v_or_b32_e32 v4, v4, v5
630 ; CI-NEXT: flat_store_dword v[0:1], v6
631 ; CI-NEXT: flat_store_dword v[2:3], v4
634 ; VI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16:
636 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
637 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
638 ; VI-NEXT: v_mov_b32_e32 v5, 0xc400
639 ; VI-NEXT: s_waitcnt lgkmcnt(0)
640 ; VI-NEXT: v_mov_b32_e32 v1, s1
641 ; VI-NEXT: s_lshr_b32 s1, s4, 16
642 ; VI-NEXT: v_mov_b32_e32 v4, s1
643 ; VI-NEXT: v_mov_b32_e32 v0, s0
644 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
645 ; VI-NEXT: v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
646 ; VI-NEXT: v_mul_f16_e64 v5, |s4|, -4.0
647 ; VI-NEXT: v_or_b32_e32 v4, v5, v4
648 ; VI-NEXT: v_mov_b32_e32 v5, s0
649 ; VI-NEXT: v_mov_b32_e32 v2, s2
650 ; VI-NEXT: v_mov_b32_e32 v3, s3
651 ; VI-NEXT: flat_store_dword v[0:1], v5
652 ; VI-NEXT: flat_store_dword v[2:3], v4
655 ; GFX9-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16:
657 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
658 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
659 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
660 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
661 ; GFX9-NEXT: s_and_b32 s4, s6, 0x7fff7fff
662 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
663 ; GFX9-NEXT: v_pk_mul_f16 v1, s4, -4.0 op_sel_hi:[1,0]
664 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
665 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
666 ; GFX9-NEXT: s_endpgm
668 ; GFX11-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16:
670 ; GFX11-NEXT: s_clause 0x1
671 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x10
672 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
673 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
674 ; GFX11-NEXT: s_and_b32 s4, s4, 0x7fff7fff
675 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
676 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
677 ; GFX11-NEXT: v_pk_mul_f16 v2, s4, -4.0 op_sel_hi:[1,0]
678 ; GFX11-NEXT: s_clause 0x1
679 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
680 ; GFX11-NEXT: global_store_b32 v0, v2, s[2:3]
681 ; GFX11-NEXT: s_nop 0
682 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
683 ; GFX11-NEXT: s_endpgm
684 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
685 %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
686 %mul = fmul <2 x half> %fneg, <half 4.0, half 4.0>
687 store <2 x half> %fabs, ptr addrspace(1) %out0
688 store <2 x half> %mul, ptr addrspace(1) %out1
; Declarations of the llvm.fabs intrinsics called by the test functions in
; this file (scalar half, <2 x half> and <4 x half>), all tagged with
; attribute group #1.
692 declare half @llvm.fabs.f16(half) #1
693 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
694 declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
; Attribute groups: #0 is attached to fold_user_fneg_fabs_v2f16, #1 to the
; intrinsic declarations.
696 attributes #0 = { nounwind }
697 attributes #1 = { nounwind readnone }