1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=CIVI,CI %s
3 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=CIVI,VI %s
4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; Adds %y to the negated absolute value of %x (both scalar half kernel
; arguments) and stores the half result to %out.
; The expected code is a subtract that carries the |x| source modifier: a
; v_sub_f16_e64 in the VI, GFX9 and GFX11 checks, and in the CI checks a
; v_sub_f32 fed by v_cvt_f32_f16 conversions, the |..| sitting on the
; conversion of %x.
7 define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) {
8 ; CI-LABEL: fneg_fabs_fadd_f16:
10 ; CI-NEXT: s_load_dword s0, s[8:9], 0x2
11 ; CI-NEXT: s_waitcnt lgkmcnt(0)
12 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
13 ; CI-NEXT: s_lshr_b32 s0, s0, 16
14 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
15 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
16 ; CI-NEXT: v_sub_f32_e32 v0, v1, v0
17 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0
18 ; CI-NEXT: s_waitcnt lgkmcnt(0)
19 ; CI-NEXT: v_mov_b32_e32 v0, s0
20 ; CI-NEXT: v_mov_b32_e32 v1, s1
21 ; CI-NEXT: flat_store_short v[0:1], v2
24 ; VI-LABEL: fneg_fabs_fadd_f16:
26 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
27 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
28 ; VI-NEXT: s_waitcnt lgkmcnt(0)
29 ; VI-NEXT: s_lshr_b32 s3, s2, 16
30 ; VI-NEXT: v_mov_b32_e32 v0, s2
31 ; VI-NEXT: v_sub_f16_e64 v2, s3, |v0|
32 ; VI-NEXT: v_mov_b32_e32 v0, s0
33 ; VI-NEXT: v_mov_b32_e32 v1, s1
34 ; VI-NEXT: flat_store_short v[0:1], v2
37 ; GFX9-LABEL: fneg_fabs_fadd_f16:
39 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
40 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
41 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
43 ; GFX9-NEXT: s_lshr_b32 s3, s2, 16
44 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
45 ; GFX9-NEXT: v_sub_f16_e64 v1, s3, |v1|
46 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
49 ; GFX11-LABEL: fneg_fabs_fadd_f16:
51 ; GFX11-NEXT: s_clause 0x1
52 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
53 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
54 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
55 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
56 ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
57 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
58 ; GFX11-NEXT: v_sub_f16_e64 v1, s3, |s2|
59 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
60 ; GFX11-NEXT: s_endpgm
; The negation of |x| is spelled as a subtraction from -0.0.
61 %fabs = call half @llvm.fabs.f16(half %x)
62 %fsub = fsub half -0.0, %fabs
63 %fadd = fadd half %y, %fsub
64 store half %fadd, ptr addrspace(1) %out, align 2
; Multiplies %y by the negated absolute value of %x (both scalar half kernel
; arguments) and stores the half result to %out.
; The expected code keeps one multiply and expresses the negate+abs as a
; -|..| source modifier: on the v_mul_f16_e64 operand in the VI, GFX9 and
; GFX11 checks, and on the v_cvt_f32_f16_e64 conversion that feeds v_mul_f32
; in the CI checks.
69 define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, half %y) {
70 ; CI-LABEL: fneg_fabs_fmul_f16:
72 ; CI-NEXT: s_load_dword s0, s[8:9], 0x2
73 ; CI-NEXT: s_waitcnt lgkmcnt(0)
74 ; CI-NEXT: s_and_b32 s1, s0, 0x7fff
75 ; CI-NEXT: s_lshr_b32 s0, s0, 16
76 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
77 ; CI-NEXT: v_cvt_f32_f16_e64 v1, -|s1|
78 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
79 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1
80 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0
81 ; CI-NEXT: s_waitcnt lgkmcnt(0)
82 ; CI-NEXT: v_mov_b32_e32 v0, s0
83 ; CI-NEXT: v_mov_b32_e32 v1, s1
84 ; CI-NEXT: flat_store_short v[0:1], v2
87 ; VI-LABEL: fneg_fabs_fmul_f16:
89 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
90 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
91 ; VI-NEXT: s_waitcnt lgkmcnt(0)
92 ; VI-NEXT: s_lshr_b32 s3, s2, 16
93 ; VI-NEXT: v_mov_b32_e32 v0, s2
94 ; VI-NEXT: v_mul_f16_e64 v2, s3, -|v0|
95 ; VI-NEXT: v_mov_b32_e32 v0, s0
96 ; VI-NEXT: v_mov_b32_e32 v1, s1
97 ; VI-NEXT: flat_store_short v[0:1], v2
100 ; GFX9-LABEL: fneg_fabs_fmul_f16:
102 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
103 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
104 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
105 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
106 ; GFX9-NEXT: s_lshr_b32 s3, s2, 16
107 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
108 ; GFX9-NEXT: v_mul_f16_e64 v1, s3, -|v1|
109 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
110 ; GFX9-NEXT: s_endpgm
112 ; GFX11-LABEL: fneg_fabs_fmul_f16:
114 ; GFX11-NEXT: s_clause 0x1
115 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
116 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
117 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
118 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
119 ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
120 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
121 ; GFX11-NEXT: v_mul_f16_e64 v1, s3, -|s2|
122 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
123 ; GFX11-NEXT: s_endpgm
; The negation of |x| is spelled as a subtraction from -0.0.
124 %fabs = call half @llvm.fabs.f16(half %x)
125 %fsub = fsub half -0.0, %fabs
126 %fmul = fmul half %y, %fsub
127 store half %fmul, ptr addrspace(1) %out, align 2
130 ; DAGCombiner will transform:
131 ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
132 ; unless isFabsFree returns true
; Takes an i16 kernel argument, bitcasts it to half, applies fabs followed by
; a negation, and stores the half result to %out.
; Every check block expects the fabs+negate pair to become one scalar
; bit-set, s_bitset1_b32 s2, 15, with no floating-point instruction emitted.
133 define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
134 ; CI-LABEL: fneg_fabs_free_f16:
136 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2
137 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
138 ; CI-NEXT: s_waitcnt lgkmcnt(0)
139 ; CI-NEXT: s_bitset1_b32 s2, 15
140 ; CI-NEXT: v_mov_b32_e32 v0, s0
141 ; CI-NEXT: v_mov_b32_e32 v1, s1
142 ; CI-NEXT: v_mov_b32_e32 v2, s2
143 ; CI-NEXT: flat_store_short v[0:1], v2
146 ; VI-LABEL: fneg_fabs_free_f16:
148 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
149 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
150 ; VI-NEXT: s_waitcnt lgkmcnt(0)
151 ; VI-NEXT: s_bitset1_b32 s2, 15
152 ; VI-NEXT: v_mov_b32_e32 v0, s0
153 ; VI-NEXT: v_mov_b32_e32 v1, s1
154 ; VI-NEXT: v_mov_b32_e32 v2, s2
155 ; VI-NEXT: flat_store_short v[0:1], v2
158 ; GFX9-LABEL: fneg_fabs_free_f16:
160 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
161 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
162 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
163 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
164 ; GFX9-NEXT: s_bitset1_b32 s2, 15
165 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
166 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
167 ; GFX9-NEXT: s_endpgm
169 ; GFX11-LABEL: fneg_fabs_free_f16:
171 ; GFX11-NEXT: s_clause 0x1
172 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
173 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
174 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
175 ; GFX11-NEXT: s_bitset1_b32 s2, 15
176 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
177 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
178 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
179 ; GFX11-NEXT: s_endpgm
; The fabs operand comes from an integer bitcast rather than a half argument.
180 %bc = bitcast i16 %in to half
181 %fabs = call half @llvm.fabs.f16(half %bc)
182 %fsub = fsub half -0.0, %fabs
183 store half %fsub, ptr addrspace(1) %out
; Applies fabs followed by a negation to a scalar half kernel argument and
; stores the result to %out.
; Every check block expects a single s_bitset1_b32 s2, 15 on the loaded
; argument register before the store.
187 define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
188 ; CI-LABEL: fneg_fabs_f16:
190 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2
191 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
192 ; CI-NEXT: s_waitcnt lgkmcnt(0)
193 ; CI-NEXT: s_bitset1_b32 s2, 15
194 ; CI-NEXT: v_mov_b32_e32 v0, s0
195 ; CI-NEXT: v_mov_b32_e32 v1, s1
196 ; CI-NEXT: v_mov_b32_e32 v2, s2
197 ; CI-NEXT: flat_store_short v[0:1], v2
200 ; VI-LABEL: fneg_fabs_f16:
202 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
203 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
204 ; VI-NEXT: s_waitcnt lgkmcnt(0)
205 ; VI-NEXT: s_bitset1_b32 s2, 15
206 ; VI-NEXT: v_mov_b32_e32 v0, s0
207 ; VI-NEXT: v_mov_b32_e32 v1, s1
208 ; VI-NEXT: v_mov_b32_e32 v2, s2
209 ; VI-NEXT: flat_store_short v[0:1], v2
212 ; GFX9-LABEL: fneg_fabs_f16:
214 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
215 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
216 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
217 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
218 ; GFX9-NEXT: s_bitset1_b32 s2, 15
219 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
220 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
221 ; GFX9-NEXT: s_endpgm
223 ; GFX11-LABEL: fneg_fabs_f16:
225 ; GFX11-NEXT: s_clause 0x1
226 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
227 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
228 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
229 ; GFX11-NEXT: s_bitset1_b32 s2, 15
230 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
231 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
232 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
233 ; GFX11-NEXT: s_endpgm
; The negation of |in| is spelled as a subtraction from -0.0.
234 %fabs = call half @llvm.fabs.f16(half %in)
235 %fsub = fsub half -0.0, %fabs
236 store half %fsub, ptr addrspace(1) %out, align 2
; Same fabs-then-negate pattern as the scalar tests, but the half value is
; loaded through the addrspace(1) pointer %in instead of being passed as a
; kernel argument.
; Every check block expects the loaded 16-bit value to be combined with
; 0x8000 by one v_or_b32 before the store (CI and VI share the CIVI prefix).
240 define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
241 ; CIVI-LABEL: v_fneg_fabs_f16:
243 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
244 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
245 ; CIVI-NEXT: v_mov_b32_e32 v0, s2
246 ; CIVI-NEXT: v_mov_b32_e32 v1, s3
247 ; CIVI-NEXT: flat_load_ushort v2, v[0:1]
248 ; CIVI-NEXT: v_mov_b32_e32 v0, s0
249 ; CIVI-NEXT: v_mov_b32_e32 v1, s1
250 ; CIVI-NEXT: s_waitcnt vmcnt(0)
251 ; CIVI-NEXT: v_or_b32_e32 v2, 0x8000, v2
252 ; CIVI-NEXT: flat_store_short v[0:1], v2
253 ; CIVI-NEXT: s_endpgm
255 ; GFX9-LABEL: v_fneg_fabs_f16:
257 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
258 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
259 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
260 ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
261 ; GFX9-NEXT: s_waitcnt vmcnt(0)
262 ; GFX9-NEXT: v_or_b32_e32 v1, 0x8000, v1
263 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
264 ; GFX9-NEXT: s_endpgm
266 ; GFX11-LABEL: v_fneg_fabs_f16:
268 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
269 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
270 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
271 ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
272 ; GFX11-NEXT: s_waitcnt vmcnt(0)
273 ; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1
274 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
275 ; GFX11-NEXT: s_endpgm
; The operand is loaded from memory, so the expected bit op is a vector one.
276 %val = load half, ptr addrspace(1) %in, align 2
277 %fabs = call half @llvm.fabs.f16(half %val)
278 %fsub = fsub half -0.0, %fabs
279 store half %fsub, ptr addrspace(1) %out, align 2
; <2 x half> variant where the fabs operand is the result of a floating-point
; add of <1.0, 2.0> to the kernel argument, not the argument itself.
; Every check block expects the add to be performed first (two scalar-f32
; adds on CI, two f16 adds on VI, one v_pk_add_f16 on GFX9 and GFX11) and the
; fabs+negate pair to become a single v_or_b32 with 0x80008000 on the packed
; 32-bit result.
283 define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <2 x half> %in) {
284 ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src:
286 ; CI-NEXT: s_load_dword s0, s[8:9], 0x2
287 ; CI-NEXT: s_waitcnt lgkmcnt(0)
288 ; CI-NEXT: s_lshr_b32 s1, s0, 16
289 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
290 ; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
291 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
292 ; CI-NEXT: v_add_f32_e32 v1, 2.0, v1
293 ; CI-NEXT: v_add_f32_e32 v0, 1.0, v0
294 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
295 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
296 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
297 ; CI-NEXT: v_or_b32_e32 v0, v0, v1
298 ; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
299 ; CI-NEXT: s_waitcnt lgkmcnt(0)
300 ; CI-NEXT: v_mov_b32_e32 v0, s0
301 ; CI-NEXT: v_mov_b32_e32 v1, s1
302 ; CI-NEXT: flat_store_dword v[0:1], v2
305 ; VI-LABEL: s_fneg_fabs_v2f16_non_bc_src:
307 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
308 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
309 ; VI-NEXT: v_mov_b32_e32 v0, 0x4000
310 ; VI-NEXT: s_waitcnt lgkmcnt(0)
311 ; VI-NEXT: s_lshr_b32 s3, s2, 16
312 ; VI-NEXT: v_mov_b32_e32 v2, s3
313 ; VI-NEXT: v_add_f16_e64 v1, s2, 1.0
314 ; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
315 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
316 ; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
317 ; VI-NEXT: v_mov_b32_e32 v0, s0
318 ; VI-NEXT: v_mov_b32_e32 v1, s1
319 ; VI-NEXT: flat_store_dword v[0:1], v2
322 ; GFX9-LABEL: s_fneg_fabs_v2f16_non_bc_src:
324 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
325 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
326 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003c00
327 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
328 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
329 ; GFX9-NEXT: v_pk_add_f16 v1, s2, v1
330 ; GFX9-NEXT: v_or_b32_e32 v1, 0x80008000, v1
331 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
332 ; GFX9-NEXT: s_endpgm
334 ; GFX11-LABEL: s_fneg_fabs_v2f16_non_bc_src:
336 ; GFX11-NEXT: s_clause 0x1
337 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
338 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
339 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
340 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
341 ; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2
342 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
343 ; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0
344 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
345 ; GFX11-NEXT: s_endpgm
; The fadd makes the fabs source a computed floating-point value.
346 %add = fadd <2 x half> %in, <half 1.0, half 2.0>
347 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %add)
348 %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
349 store <2 x half> %fneg.fabs, ptr addrspace(1) %out
353 ; FIXME: single bit op
354 ; Combine turns this into integer op when bitcast source (from load)
; <2 x half> variant where fabs is applied directly to the kernel argument.
; Every check block expects the fabs+negate pair to become one scalar
; s_or_b32 of the loaded 32-bit argument with 0x80008000, followed by the
; store.
355 define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x half> %in) {
356 ; CI-LABEL: s_fneg_fabs_v2f16_bc_src:
358 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2
359 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
360 ; CI-NEXT: s_waitcnt lgkmcnt(0)
361 ; CI-NEXT: s_or_b32 s2, s2, 0x80008000
362 ; CI-NEXT: v_mov_b32_e32 v0, s0
363 ; CI-NEXT: v_mov_b32_e32 v1, s1
364 ; CI-NEXT: v_mov_b32_e32 v2, s2
365 ; CI-NEXT: flat_store_dword v[0:1], v2
368 ; VI-LABEL: s_fneg_fabs_v2f16_bc_src:
370 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
371 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
372 ; VI-NEXT: s_waitcnt lgkmcnt(0)
373 ; VI-NEXT: s_or_b32 s2, s2, 0x80008000
374 ; VI-NEXT: v_mov_b32_e32 v0, s0
375 ; VI-NEXT: v_mov_b32_e32 v1, s1
376 ; VI-NEXT: v_mov_b32_e32 v2, s2
377 ; VI-NEXT: flat_store_dword v[0:1], v2
380 ; GFX9-LABEL: s_fneg_fabs_v2f16_bc_src:
382 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
383 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
384 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
385 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
386 ; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000
387 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
388 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
389 ; GFX9-NEXT: s_endpgm
391 ; GFX11-LABEL: s_fneg_fabs_v2f16_bc_src:
393 ; GFX11-NEXT: s_clause 0x1
394 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
395 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
396 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
397 ; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000
398 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
399 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
400 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
401 ; GFX11-NEXT: s_endpgm
; Both lanes are negated by subtracting from a <-0.0, -0.0> vector.
402 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
403 %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
404 store <2 x half> %fneg.fabs, ptr addrspace(1) %out
; <4 x half> variant: fabs followed by a negation of a kernel argument, with
; the 64-bit result stored to %out.
; Every check block expects two scalar s_or_b32 instructions with 0x80008000,
; one on each of the two 32-bit registers (s2 and s3) holding the argument.
408 define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
409 ; CIVI-LABEL: fneg_fabs_v4f16:
411 ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
412 ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
413 ; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000
414 ; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000
415 ; CIVI-NEXT: v_mov_b32_e32 v3, s1
416 ; CIVI-NEXT: v_mov_b32_e32 v0, s2
417 ; CIVI-NEXT: v_mov_b32_e32 v1, s3
418 ; CIVI-NEXT: v_mov_b32_e32 v2, s0
419 ; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
420 ; CIVI-NEXT: s_endpgm
422 ; GFX9-LABEL: fneg_fabs_v4f16:
424 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
425 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
426 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
427 ; GFX9-NEXT: s_or_b32 s3, s3, 0x80008000
428 ; GFX9-NEXT: s_or_b32 s2, s2, 0x80008000
429 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
430 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
431 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
432 ; GFX9-NEXT: s_endpgm
434 ; GFX11-LABEL: fneg_fabs_v4f16:
436 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
437 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
438 ; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000
439 ; GFX11-NEXT: s_or_b32 s3, s3, 0x80008000
440 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
441 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
442 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
443 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
444 ; GFX11-NEXT: s_endpgm
; All four lanes are negated by subtracting from a vector of -0.0.
445 %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
446 %fsub = fsub <4 x half> <half -0.0, half -0.0, half -0.0, half -0.0>, %fabs
447 store <4 x half> %fsub, ptr addrspace(1) %out
; The negated fabs of a <2 x half> kernel argument is used by a multiply with
; <4.0, 4.0>; the product is stored to %out.
; Every check block expects the negation to be absorbed into the multiply
; constant, which appears as -4.0 (or as 0xc400, the half encoding of -4.0,
; in the VI sdwa multiply). The fabs remains as an |..| source modifier in
; the CI and VI checks and as s_and_b32 with 0x7fff7fff in the GFX9 and
; GFX11 checks.
451 define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) #0 {
452 ; CI-LABEL: fold_user_fneg_fabs_v2f16:
454 ; CI-NEXT: s_load_dword s0, s[8:9], 0x2
455 ; CI-NEXT: s_waitcnt lgkmcnt(0)
456 ; CI-NEXT: s_lshr_b32 s1, s0, 16
457 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
458 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
459 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
460 ; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1
461 ; CI-NEXT: v_mul_f32_e32 v0, -4.0, v0
462 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
463 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
464 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
465 ; CI-NEXT: v_or_b32_e32 v2, v0, v1
466 ; CI-NEXT: s_waitcnt lgkmcnt(0)
467 ; CI-NEXT: v_mov_b32_e32 v0, s0
468 ; CI-NEXT: v_mov_b32_e32 v1, s1
469 ; CI-NEXT: flat_store_dword v[0:1], v2
472 ; VI-LABEL: fold_user_fneg_fabs_v2f16:
474 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
475 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
476 ; VI-NEXT: v_mov_b32_e32 v0, 0xc400
477 ; VI-NEXT: s_waitcnt lgkmcnt(0)
478 ; VI-NEXT: s_lshr_b32 s3, s2, 16
479 ; VI-NEXT: v_mov_b32_e32 v2, s3
480 ; VI-NEXT: v_mul_f16_e64 v1, |s2|, -4.0
481 ; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
482 ; VI-NEXT: v_or_b32_e32 v2, v1, v0
483 ; VI-NEXT: v_mov_b32_e32 v0, s0
484 ; VI-NEXT: v_mov_b32_e32 v1, s1
485 ; VI-NEXT: flat_store_dword v[0:1], v2
488 ; GFX9-LABEL: fold_user_fneg_fabs_v2f16:
490 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
491 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
492 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
493 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
494 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
495 ; GFX9-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0]
496 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
497 ; GFX9-NEXT: s_endpgm
499 ; GFX11-LABEL: fold_user_fneg_fabs_v2f16:
501 ; GFX11-NEXT: s_clause 0x1
502 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
503 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
504 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
505 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
506 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
507 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
508 ; GFX11-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0]
509 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
510 ; GFX11-NEXT: s_endpgm
; The fmul is the only user of the negated value.
511 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
512 %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
513 %mul = fmul <2 x half> %fneg.fabs, <half 4.0, half 4.0>
514 store <2 x half> %mul, ptr addrspace(1) %out
; The fabs of a <2 x half> kernel argument has two uses: it is stored to
; %out0 as-is, and its negation is stored to %out1.
; Every check block expects the fabs to be materialized once as s_and_b32
; with 0x7fff7fff, and the negated value to be derived from that result by a
; second scalar bit op with 0x80008000 (s_or_b32 in the CI checks, s_xor_b32
; in the VI, GFX9 and GFX11 checks).
518 define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) {
519 ; CI-LABEL: s_fneg_multi_use_fabs_v2f16:
521 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
522 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
523 ; CI-NEXT: s_waitcnt lgkmcnt(0)
524 ; CI-NEXT: v_mov_b32_e32 v0, s0
525 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
526 ; CI-NEXT: v_mov_b32_e32 v1, s1
527 ; CI-NEXT: s_or_b32 s1, s0, 0x80008000
528 ; CI-NEXT: v_mov_b32_e32 v4, s0
529 ; CI-NEXT: v_mov_b32_e32 v2, s2
530 ; CI-NEXT: v_mov_b32_e32 v3, s3
531 ; CI-NEXT: flat_store_dword v[0:1], v4
532 ; CI-NEXT: v_mov_b32_e32 v0, s1
533 ; CI-NEXT: flat_store_dword v[2:3], v0
536 ; VI-LABEL: s_fneg_multi_use_fabs_v2f16:
538 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
539 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
540 ; VI-NEXT: s_waitcnt lgkmcnt(0)
541 ; VI-NEXT: v_mov_b32_e32 v0, s0
542 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
543 ; VI-NEXT: v_mov_b32_e32 v1, s1
544 ; VI-NEXT: s_xor_b32 s1, s0, 0x80008000
545 ; VI-NEXT: v_mov_b32_e32 v4, s0
546 ; VI-NEXT: v_mov_b32_e32 v2, s2
547 ; VI-NEXT: v_mov_b32_e32 v3, s3
548 ; VI-NEXT: flat_store_dword v[0:1], v4
549 ; VI-NEXT: v_mov_b32_e32 v0, s1
550 ; VI-NEXT: flat_store_dword v[2:3], v0
553 ; GFX9-LABEL: s_fneg_multi_use_fabs_v2f16:
555 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
556 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
557 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
558 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
559 ; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff
560 ; GFX9-NEXT: s_xor_b32 s5, s4, 0x80008000
561 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
562 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
563 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
564 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
565 ; GFX9-NEXT: s_endpgm
567 ; GFX11-LABEL: s_fneg_multi_use_fabs_v2f16:
569 ; GFX11-NEXT: s_clause 0x1
570 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x10
571 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
572 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
573 ; GFX11-NEXT: s_and_b32 s4, s6, 0x7fff7fff
574 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
575 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
576 ; GFX11-NEXT: s_xor_b32 s5, s4, 0x80008000
577 ; GFX11-NEXT: v_mov_b32_e32 v2, s5
578 ; GFX11-NEXT: s_clause 0x1
579 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
580 ; GFX11-NEXT: global_store_b32 v0, v2, s[2:3]
581 ; GFX11-NEXT: s_endpgm
; %fabs is stored itself and also feeds the negation, so it has two users.
582 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
583 %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
584 store <2 x half> %fabs, ptr addrspace(1) %out0
585 store <2 x half> %fneg, ptr addrspace(1) %out1
; The fabs of a <2 x half> kernel argument is stored to %out0, and its
; negation multiplied by <4.0, 4.0> is stored to %out1.
; Every check block expects the stored fabs to be produced by s_and_b32 with
; 0x7fff7fff, and no separate negate instruction for the second result: the
; multiply constant appears as -4.0 (or 0xc400, the half encoding of -4.0,
; in the VI sdwa multiply).
589 define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x half> %in) {
590 ; CI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16:
592 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
593 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
594 ; CI-NEXT: s_waitcnt lgkmcnt(0)
595 ; CI-NEXT: v_mov_b32_e32 v0, s0
596 ; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010
597 ; CI-NEXT: v_cvt_f32_f16_e32 v5, s0
598 ; CI-NEXT: v_cvt_f32_f16_e64 v4, |s4|
599 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
600 ; CI-NEXT: v_mov_b32_e32 v1, s1
601 ; CI-NEXT: v_mul_f32_e32 v5, -4.0, v5
602 ; CI-NEXT: v_mul_f32_e32 v4, -4.0, v4
603 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
604 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
605 ; CI-NEXT: v_mov_b32_e32 v6, s0
606 ; CI-NEXT: v_mov_b32_e32 v2, s2
607 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
608 ; CI-NEXT: v_mov_b32_e32 v3, s3
609 ; CI-NEXT: v_or_b32_e32 v4, v4, v5
610 ; CI-NEXT: flat_store_dword v[0:1], v6
611 ; CI-NEXT: flat_store_dword v[2:3], v4
614 ; VI-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16:
616 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
617 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
618 ; VI-NEXT: v_mov_b32_e32 v5, 0xc400
619 ; VI-NEXT: s_waitcnt lgkmcnt(0)
620 ; VI-NEXT: v_mov_b32_e32 v1, s1
621 ; VI-NEXT: s_lshr_b32 s1, s4, 16
622 ; VI-NEXT: v_mov_b32_e32 v4, s1
623 ; VI-NEXT: v_mov_b32_e32 v0, s0
624 ; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
625 ; VI-NEXT: v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
626 ; VI-NEXT: v_mul_f16_e64 v5, |s4|, -4.0
627 ; VI-NEXT: v_or_b32_e32 v4, v5, v4
628 ; VI-NEXT: v_mov_b32_e32 v5, s0
629 ; VI-NEXT: v_mov_b32_e32 v2, s2
630 ; VI-NEXT: v_mov_b32_e32 v3, s3
631 ; VI-NEXT: flat_store_dword v[0:1], v5
632 ; VI-NEXT: flat_store_dword v[2:3], v4
635 ; GFX9-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16:
637 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
638 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
639 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
640 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
641 ; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff7fff
642 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
643 ; GFX9-NEXT: v_pk_mul_f16 v1, s4, -4.0 op_sel_hi:[1,0]
644 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
645 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
646 ; GFX9-NEXT: s_endpgm
648 ; GFX11-LABEL: s_fneg_multi_use_fabs_foldable_neg_v2f16:
650 ; GFX11-NEXT: s_clause 0x1
651 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x10
652 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
653 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
654 ; GFX11-NEXT: s_and_b32 s4, s6, 0x7fff7fff
655 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
656 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
657 ; GFX11-NEXT: v_pk_mul_f16 v2, s4, -4.0 op_sel_hi:[1,0]
658 ; GFX11-NEXT: s_clause 0x1
659 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
660 ; GFX11-NEXT: global_store_b32 v0, v2, s[2:3]
661 ; GFX11-NEXT: s_endpgm
; %fabs is stored itself; the negated value is only used by the fmul.
662 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
663 %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
664 %mul = fmul <2 x half> %fneg, <half 4.0, half 4.0>
665 store <2 x half> %fabs, ptr addrspace(1) %out0
666 store <2 x half> %mul, ptr addrspace(1) %out1
; fabs intrinsic declarations for the scalar, 2-element and 4-element half
; types called by the kernels in this file; all three use attribute group #1.
670 declare half @llvm.fabs.f16(half) #1
671 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
672 declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
; Attribute group #0 is attached to the fold_user_fneg_fabs_v2f16 kernel;
; group #1 is attached to the intrinsic declarations above.
674 attributes #0 = { nounwind }
675 attributes #1 = { nounwind readnone }