1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
3 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
7 ; DAGCombiner will transform:
8 ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
9 ; unless isFabsFree returns true
; fabs of an i16 kernel argument that is bitcast to half. The assertions
; expect the sign bit to be cleared with a scalar s_and_b32 using the mask
; 0x7fff on every tested target.
11 define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
12 ; CI-LABEL: s_fabs_free_f16:
14 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2
15 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
16 ; CI-NEXT: s_waitcnt lgkmcnt(0)
17 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff
18 ; CI-NEXT: v_mov_b32_e32 v0, s0
19 ; CI-NEXT: v_mov_b32_e32 v1, s1
20 ; CI-NEXT: v_mov_b32_e32 v2, s2
21 ; CI-NEXT: flat_store_short v[0:1], v2
24 ; VI-LABEL: s_fabs_free_f16:
26 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
27 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
28 ; VI-NEXT: s_waitcnt lgkmcnt(0)
29 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
30 ; VI-NEXT: v_mov_b32_e32 v0, s0
31 ; VI-NEXT: v_mov_b32_e32 v1, s1
32 ; VI-NEXT: v_mov_b32_e32 v2, s2
33 ; VI-NEXT: flat_store_short v[0:1], v2
36 ; GFX9-LABEL: s_fabs_free_f16:
38 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
39 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
40 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
41 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
42 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
43 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
44 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
47 ; GFX11-LABEL: s_fabs_free_f16:
49 ; GFX11-NEXT: s_clause 0x1
50 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
51 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
52 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
53 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
54 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
55 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
56 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
58 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
59 ; GFX11-NEXT: s_endpgm
60 %bc= bitcast i16 %in to half
61 %fabs = call half @llvm.fabs.f16(half %bc)
62 store half %fabs, ptr addrspace(1) %out
; fabs of a half kernel argument stored straight to %out. The expected code
; is the same scalar s_and_b32 with 0x7fff as in the bitcast variant above
; this function.
66 define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
67 ; CI-LABEL: s_fabs_f16:
69 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2
70 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
71 ; CI-NEXT: s_waitcnt lgkmcnt(0)
72 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff
73 ; CI-NEXT: v_mov_b32_e32 v0, s0
74 ; CI-NEXT: v_mov_b32_e32 v1, s1
75 ; CI-NEXT: v_mov_b32_e32 v2, s2
76 ; CI-NEXT: flat_store_short v[0:1], v2
79 ; VI-LABEL: s_fabs_f16:
81 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
82 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
83 ; VI-NEXT: s_waitcnt lgkmcnt(0)
84 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
85 ; VI-NEXT: v_mov_b32_e32 v0, s0
86 ; VI-NEXT: v_mov_b32_e32 v1, s1
87 ; VI-NEXT: v_mov_b32_e32 v2, s2
88 ; VI-NEXT: flat_store_short v[0:1], v2
91 ; GFX9-LABEL: s_fabs_f16:
93 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
94 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
95 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
96 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
97 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
98 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
99 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
100 ; GFX9-NEXT: s_endpgm
102 ; GFX11-LABEL: s_fabs_f16:
104 ; GFX11-NEXT: s_clause 0x1
105 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
106 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
107 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
108 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
109 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
110 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
111 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
112 ; GFX11-NEXT: s_nop 0
113 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
114 ; GFX11-NEXT: s_endpgm
115 %fabs = call half @llvm.fabs.f16(half %in)
116 store half %fabs, ptr addrspace(1) %out
; fabs of a <2 x half> kernel argument. Both sign bits are expected to be
; cleared by one scalar s_and_b32 with the packed mask 0x7fff7fff.
120 define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
121 ; CI-LABEL: s_fabs_v2f16:
123 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2
124 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
125 ; CI-NEXT: s_waitcnt lgkmcnt(0)
126 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
127 ; CI-NEXT: v_mov_b32_e32 v0, s0
128 ; CI-NEXT: v_mov_b32_e32 v1, s1
129 ; CI-NEXT: v_mov_b32_e32 v2, s2
130 ; CI-NEXT: flat_store_dword v[0:1], v2
133 ; VI-LABEL: s_fabs_v2f16:
135 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
136 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
137 ; VI-NEXT: s_waitcnt lgkmcnt(0)
138 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
139 ; VI-NEXT: v_mov_b32_e32 v0, s0
140 ; VI-NEXT: v_mov_b32_e32 v1, s1
141 ; VI-NEXT: v_mov_b32_e32 v2, s2
142 ; VI-NEXT: flat_store_dword v[0:1], v2
145 ; GFX9-LABEL: s_fabs_v2f16:
147 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
148 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
149 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
150 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
151 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
152 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
153 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
154 ; GFX9-NEXT: s_endpgm
156 ; GFX11-LABEL: s_fabs_v2f16:
158 ; GFX11-NEXT: s_clause 0x1
159 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
160 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
161 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
162 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
163 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
164 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
165 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
166 ; GFX11-NEXT: s_nop 0
167 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
168 ; GFX11-NEXT: s_endpgm
169 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
170 store <2 x half> %fabs, ptr addrspace(1) %out
; fabs of a <4 x half> kernel argument. The vector occupies two 32-bit
; scalar registers (s2 and s3 in the assertions) and each one is expected
; to be masked with its own s_and_b32 0x7fff7fff.
174 define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
175 ; CI-LABEL: s_fabs_v4f16:
177 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
178 ; CI-NEXT: s_waitcnt lgkmcnt(0)
179 ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
180 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
181 ; CI-NEXT: v_mov_b32_e32 v3, s1
182 ; CI-NEXT: v_mov_b32_e32 v0, s2
183 ; CI-NEXT: v_mov_b32_e32 v1, s3
184 ; CI-NEXT: v_mov_b32_e32 v2, s0
185 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
188 ; VI-LABEL: s_fabs_v4f16:
190 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
191 ; VI-NEXT: s_waitcnt lgkmcnt(0)
192 ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
193 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
194 ; VI-NEXT: v_mov_b32_e32 v3, s1
195 ; VI-NEXT: v_mov_b32_e32 v0, s2
196 ; VI-NEXT: v_mov_b32_e32 v1, s3
197 ; VI-NEXT: v_mov_b32_e32 v2, s0
198 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
201 ; GFX9-LABEL: s_fabs_v4f16:
203 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
204 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
205 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
206 ; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff
207 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
208 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
209 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
210 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
211 ; GFX9-NEXT: s_endpgm
213 ; GFX11-LABEL: s_fabs_v4f16:
215 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
216 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
217 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
218 ; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff
219 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
220 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
221 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
222 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
223 ; GFX11-NEXT: s_nop 0
224 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
225 ; GFX11-NEXT: s_endpgm
226 %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
227 store <4 x half> %fabs, ptr addrspace(1) %out
; fabs whose only user is an fmul. No separate and instruction is expected;
; the absolute value shows up as a |src| modifier instead. On the kaveri run
; it is attached to the f16->f32 convert (v_cvt_f32_f16_e64 with |s0|); on
; the other runs it is attached to the v_mul_f16_e64 itself.
231 define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) {
232 ; CI-LABEL: fabs_fold_f16:
234 ; CI-NEXT: s_load_dword s0, s[4:5], 0x2
235 ; CI-NEXT: s_waitcnt lgkmcnt(0)
236 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
237 ; CI-NEXT: s_lshr_b32 s0, s0, 16
238 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
239 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
240 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1
241 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0
242 ; CI-NEXT: s_waitcnt lgkmcnt(0)
243 ; CI-NEXT: v_mov_b32_e32 v0, s0
244 ; CI-NEXT: v_mov_b32_e32 v1, s1
245 ; CI-NEXT: flat_store_short v[0:1], v2
248 ; VI-LABEL: fabs_fold_f16:
250 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
251 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
252 ; VI-NEXT: s_waitcnt lgkmcnt(0)
253 ; VI-NEXT: s_lshr_b32 s3, s2, 16
254 ; VI-NEXT: v_mov_b32_e32 v0, s3
255 ; VI-NEXT: v_mul_f16_e64 v2, |s2|, v0
256 ; VI-NEXT: v_mov_b32_e32 v0, s0
257 ; VI-NEXT: v_mov_b32_e32 v1, s1
258 ; VI-NEXT: flat_store_short v[0:1], v2
261 ; GFX9-LABEL: fabs_fold_f16:
263 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
264 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
265 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
266 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
267 ; GFX9-NEXT: s_lshr_b32 s3, s2, 16
268 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
269 ; GFX9-NEXT: v_mul_f16_e64 v1, |s2|, v1
270 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
271 ; GFX9-NEXT: s_endpgm
273 ; GFX11-LABEL: fabs_fold_f16:
275 ; GFX11-NEXT: s_clause 0x1
276 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
277 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
278 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
279 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
280 ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
281 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
282 ; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3
283 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
284 ; GFX11-NEXT: s_nop 0
285 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
286 ; GFX11-NEXT: s_endpgm
287 %fabs = call half @llvm.fabs.f16(half %in0)
288 %fmul = fmul half %fabs, %in1
289 store half %fmul, ptr addrspace(1) %out
; fabs of a <2 x half> value loaded from memory at an index taken from the
; workitem id. The expected lowering is a vector v_and_b32 with 0x7fff7fff.
;
; NOTE(review): %gep.out is computed from %in, not %out, so the result is
; stored back to the address it was loaded from and %out is never used (the
; assertions only load one pointer from the kernarg segment). This looks
; like a typo for %out; changing it requires regenerating the assertions
; with utils/update_llc_test_checks.py.
293 define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
294 ; CI-LABEL: v_fabs_v2f16:
296 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
297 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
298 ; CI-NEXT: s_waitcnt lgkmcnt(0)
299 ; CI-NEXT: v_mov_b32_e32 v1, s1
300 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
301 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
302 ; CI-NEXT: flat_load_dword v2, v[0:1]
303 ; CI-NEXT: s_waitcnt vmcnt(0)
304 ; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
305 ; CI-NEXT: flat_store_dword v[0:1], v2
308 ; VI-LABEL: v_fabs_v2f16:
310 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
311 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
312 ; VI-NEXT: s_waitcnt lgkmcnt(0)
313 ; VI-NEXT: v_mov_b32_e32 v1, s1
314 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
315 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
316 ; VI-NEXT: flat_load_dword v2, v[0:1]
317 ; VI-NEXT: s_waitcnt vmcnt(0)
318 ; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
319 ; VI-NEXT: flat_store_dword v[0:1], v2
322 ; GFX9-LABEL: v_fabs_v2f16:
324 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
325 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
326 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
327 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
328 ; GFX9-NEXT: s_waitcnt vmcnt(0)
329 ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
330 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
331 ; GFX9-NEXT: s_endpgm
333 ; GFX11-LABEL: v_fabs_v2f16:
335 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
336 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
337 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
338 ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
339 ; GFX11-NEXT: s_waitcnt vmcnt(0)
340 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
341 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
342 ; GFX11-NEXT: s_nop 0
343 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
344 ; GFX11-NEXT: s_endpgm
345 %tid = call i32 @llvm.amdgcn.workitem.id.x()
346 %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
347 %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
348 %val = load <2 x half>, ptr addrspace(1) %gep.in, align 2
349 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
350 store <2 x half> %fabs, ptr addrspace(1) %gep.out
; fabs of an i32 kernel argument that is bitcast to <2 x half>. As in the
; scalar f16 bitcast case, the assertions expect a plain s_and_b32, here
; with the packed mask 0x7fff7fff.
354 define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
355 ; CI-LABEL: fabs_free_v2f16:
357 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2
358 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
359 ; CI-NEXT: s_waitcnt lgkmcnt(0)
360 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
361 ; CI-NEXT: v_mov_b32_e32 v0, s0
362 ; CI-NEXT: v_mov_b32_e32 v1, s1
363 ; CI-NEXT: v_mov_b32_e32 v2, s2
364 ; CI-NEXT: flat_store_dword v[0:1], v2
367 ; VI-LABEL: fabs_free_v2f16:
369 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
370 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
371 ; VI-NEXT: s_waitcnt lgkmcnt(0)
372 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
373 ; VI-NEXT: v_mov_b32_e32 v0, s0
374 ; VI-NEXT: v_mov_b32_e32 v1, s1
375 ; VI-NEXT: v_mov_b32_e32 v2, s2
376 ; VI-NEXT: flat_store_dword v[0:1], v2
379 ; GFX9-LABEL: fabs_free_v2f16:
381 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
382 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
383 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
384 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
385 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
386 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
387 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
388 ; GFX9-NEXT: s_endpgm
390 ; GFX11-LABEL: fabs_free_v2f16:
392 ; GFX11-NEXT: s_clause 0x1
393 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
394 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
395 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
396 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
397 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
398 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
399 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
400 ; GFX11-NEXT: s_nop 0
401 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
402 ; GFX11-NEXT: s_endpgm
403 %bc = bitcast i32 %in to <2 x half>
404 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
405 store <2 x half> %fabs, ptr addrspace(1) %out
409 ; FIXME: Should do fabs after conversion to avoid converting multiple
410 ; times in this particular case.
; Multiplies fabs(%val) by %val itself for a loaded <2 x half>.
; In the kaveri assertions each half is converted to f32 twice, once plain
; and once with the |src| modifier. In the tonga assertions the fabs is a
; |v2| modifier on v_mul_f16_sdwa / v_mul_f16_e64. In the gfx900 and gfx1100
; assertions the fabs stays a separate v_and_b32 feeding v_pk_mul_f16.
411 define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
412 ; CI-LABEL: v_fabs_fold_self_v2f16:
414 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
415 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
416 ; CI-NEXT: s_waitcnt lgkmcnt(0)
417 ; CI-NEXT: v_mov_b32_e32 v1, s3
418 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
419 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
420 ; CI-NEXT: flat_load_dword v0, v[0:1]
421 ; CI-NEXT: s_waitcnt vmcnt(0)
422 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
423 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v1
424 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
425 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v0
426 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
427 ; CI-NEXT: v_mul_f32_e32 v1, v1, v2
428 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v1
429 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3
430 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v0
431 ; CI-NEXT: v_mov_b32_e32 v0, s0
432 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
433 ; CI-NEXT: v_mov_b32_e32 v1, s1
434 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
435 ; CI-NEXT: flat_store_dword v[0:1], v2
438 ; VI-LABEL: v_fabs_fold_self_v2f16:
440 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
441 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
442 ; VI-NEXT: s_waitcnt lgkmcnt(0)
443 ; VI-NEXT: v_mov_b32_e32 v1, s3
444 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
445 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
446 ; VI-NEXT: flat_load_dword v2, v[0:1]
447 ; VI-NEXT: v_mov_b32_e32 v0, s0
448 ; VI-NEXT: v_mov_b32_e32 v1, s1
449 ; VI-NEXT: s_waitcnt vmcnt(0)
450 ; VI-NEXT: v_mul_f16_sdwa v3, |v2|, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
451 ; VI-NEXT: v_mul_f16_e64 v2, |v2|, v2
452 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
453 ; VI-NEXT: flat_store_dword v[0:1], v2
456 ; GFX9-LABEL: v_fabs_fold_self_v2f16:
458 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
459 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
460 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
461 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
462 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
463 ; GFX9-NEXT: s_waitcnt vmcnt(0)
464 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v0
465 ; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0
466 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
467 ; GFX9-NEXT: s_endpgm
469 ; GFX11-LABEL: v_fabs_fold_self_v2f16:
471 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
472 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
473 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
474 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
475 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
476 ; GFX11-NEXT: s_waitcnt vmcnt(0)
477 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
478 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
479 ; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0
480 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
481 ; GFX11-NEXT: s_nop 0
482 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
483 ; GFX11-NEXT: s_endpgm
484 %tid = call i32 @llvm.amdgcn.workitem.id.x()
485 %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
486 %val = load <2 x half>, ptr addrspace(1) %gep
487 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
488 %fmul = fmul <2 x half> %fabs, %val
489 store <2 x half> %fmul, ptr addrspace(1) %out
; Multiplies fabs of a loaded <2 x half> by a second operand that arrives
; as an i32 kernel argument and is bitcast to <2 x half>.
; In the kaveri and tonga assertions the fabs appears as |src| modifiers
; (on the f16->f32 converts and on the f16 multiplies respectively). In the
; gfx900 and gfx1100 assertions it stays a separate v_and_b32 0x7fff7fff
; ahead of v_pk_mul_f16.
493 define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %other.val) #0 {
494 ; CI-LABEL: v_fabs_fold_v2f16:
496 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
497 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
498 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
499 ; CI-NEXT: s_waitcnt lgkmcnt(0)
500 ; CI-NEXT: v_mov_b32_e32 v1, s3
501 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
502 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
503 ; CI-NEXT: flat_load_dword v0, v[0:1]
504 ; CI-NEXT: s_lshr_b32 s2, s4, 16
505 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s2
506 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s4
507 ; CI-NEXT: s_waitcnt vmcnt(0)
508 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
509 ; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
510 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
511 ; CI-NEXT: v_mul_f32_e32 v1, v2, v1
512 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v1
513 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3
514 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v0
515 ; CI-NEXT: v_mov_b32_e32 v0, s0
516 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
517 ; CI-NEXT: v_mov_b32_e32 v1, s1
518 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
519 ; CI-NEXT: flat_store_dword v[0:1], v2
522 ; VI-LABEL: v_fabs_fold_v2f16:
524 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
525 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
526 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
527 ; VI-NEXT: s_waitcnt lgkmcnt(0)
528 ; VI-NEXT: v_mov_b32_e32 v1, s3
529 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
530 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
531 ; VI-NEXT: flat_load_dword v2, v[0:1]
532 ; VI-NEXT: v_mov_b32_e32 v0, s0
533 ; VI-NEXT: s_lshr_b32 s0, s4, 16
534 ; VI-NEXT: v_mov_b32_e32 v3, s0
535 ; VI-NEXT: v_mov_b32_e32 v1, s1
536 ; VI-NEXT: s_waitcnt vmcnt(0)
537 ; VI-NEXT: v_mul_f16_sdwa v3, |v2|, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
538 ; VI-NEXT: v_mul_f16_e64 v2, |v2|, s4
539 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
540 ; VI-NEXT: flat_store_dword v[0:1], v2
543 ; GFX9-LABEL: v_fabs_fold_v2f16:
545 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
546 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
547 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
548 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
549 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
550 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
551 ; GFX9-NEXT: s_waitcnt vmcnt(0)
552 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
553 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, s6
554 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
555 ; GFX9-NEXT: s_endpgm
557 ; GFX11-LABEL: v_fabs_fold_v2f16:
559 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
560 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
561 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10
562 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
563 ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
564 ; GFX11-NEXT: s_waitcnt vmcnt(0)
565 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
566 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
567 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, s0
568 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
569 ; GFX11-NEXT: s_nop 0
570 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
571 ; GFX11-NEXT: s_endpgm
572 %tid = call i32 @llvm.amdgcn.workitem.id.x()
573 %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
574 %val = load <2 x half>, ptr addrspace(1) %gep
575 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
576 %other.val.cvt = bitcast i32 %other.val to <2 x half>
577 %fmul = fmul <2 x half> %fabs, %other.val.cvt
578 store <2 x half> %fmul, ptr addrspace(1) %out
; Vector fabs whose two elements are extracted and each fed to a scalar FP
; operation (fmul by 4.0 and fadd of 2.0). For every target the assertions
; contain no and instruction; the absolute value is carried as a |src|
; modifier on the convert, multiply or add that consumes each element.
; Both results are written with volatile stores to an undef pointer.
582 define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
583 ; CI-LABEL: v_extract_fabs_fold_v2f16:
585 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
586 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
587 ; CI-NEXT: s_waitcnt lgkmcnt(0)
588 ; CI-NEXT: v_mov_b32_e32 v1, s1
589 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
590 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
591 ; CI-NEXT: flat_load_dword v0, v[0:1]
592 ; CI-NEXT: s_waitcnt vmcnt(0)
593 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
594 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
595 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
596 ; CI-NEXT: v_mul_f32_e32 v0, 4.0, v0
597 ; CI-NEXT: v_add_f32_e32 v1, 2.0, v1
598 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
599 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
600 ; CI-NEXT: flat_store_short v[0:1], v0
601 ; CI-NEXT: s_waitcnt vmcnt(0)
602 ; CI-NEXT: flat_store_short v[0:1], v1
603 ; CI-NEXT: s_waitcnt vmcnt(0)
606 ; VI-LABEL: v_extract_fabs_fold_v2f16:
608 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
609 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
610 ; VI-NEXT: s_waitcnt lgkmcnt(0)
611 ; VI-NEXT: v_mov_b32_e32 v1, s1
612 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
613 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
614 ; VI-NEXT: flat_load_dword v0, v[0:1]
615 ; VI-NEXT: v_mov_b32_e32 v1, 0x4000
616 ; VI-NEXT: s_waitcnt vmcnt(0)
617 ; VI-NEXT: v_mul_f16_e64 v2, |v0|, 4.0
618 ; VI-NEXT: v_add_f16_sdwa v0, |v0|, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
619 ; VI-NEXT: flat_store_short v[0:1], v2
620 ; VI-NEXT: s_waitcnt vmcnt(0)
621 ; VI-NEXT: flat_store_short v[0:1], v0
622 ; VI-NEXT: s_waitcnt vmcnt(0)
625 ; GFX9-LABEL: v_extract_fabs_fold_v2f16:
627 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
628 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
629 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000
630 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
631 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
632 ; GFX9-NEXT: s_waitcnt vmcnt(0)
633 ; GFX9-NEXT: v_mul_f16_e64 v2, |v0|, 4.0
634 ; GFX9-NEXT: v_add_f16_sdwa v0, |v0|, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
635 ; GFX9-NEXT: global_store_short v[0:1], v2, off
636 ; GFX9-NEXT: s_waitcnt vmcnt(0)
637 ; GFX9-NEXT: global_store_short v[0:1], v0, off
638 ; GFX9-NEXT: s_waitcnt vmcnt(0)
639 ; GFX9-NEXT: s_endpgm
641 ; GFX11-LABEL: v_extract_fabs_fold_v2f16:
643 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
644 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
645 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
646 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
647 ; GFX11-NEXT: s_waitcnt vmcnt(0)
648 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
649 ; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, 4.0
650 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
651 ; GFX11-NEXT: v_add_f16_e64 v1, |v1|, 2.0
652 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
653 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
654 ; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc
655 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
656 ; GFX11-NEXT: s_nop 0
657 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
658 ; GFX11-NEXT: s_endpgm
659 %tid = call i32 @llvm.amdgcn.workitem.id.x()
660 %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
661 %val = load <2 x half>, ptr addrspace(1) %gep.in
662 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
663 %elt0 = extractelement <2 x half> %fabs, i32 0
664 %elt1 = extractelement <2 x half> %fabs, i32 1
666 %fmul0 = fmul half %elt0, 4.0
667 %fadd1 = fadd half %elt1, 2.0
668 store volatile half %fmul0, ptr addrspace(1) undef
669 store volatile half %fadd1, ptr addrspace(1) undef
; Vector fabs whose two elements are extracted and stored directly, with no
; floating-point user to carry a source modifier. The assertions therefore
; expect explicit integer masking: v_and_b32 plus v_bfe_u32 (bits 16..30)
; in the kaveri and tonga runs, and a single v_and_b32 0x7fff7fff followed
; by a d16_hi store of the upper half in the gfx900 and gfx1100 runs.
; Both elements are written with volatile stores to an undef pointer.
673 define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 {
674 ; CI-LABEL: v_extract_fabs_no_fold_v2f16:
676 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
677 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
678 ; CI-NEXT: s_waitcnt lgkmcnt(0)
679 ; CI-NEXT: v_mov_b32_e32 v1, s1
680 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
681 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
682 ; CI-NEXT: flat_load_dword v0, v[0:1]
683 ; CI-NEXT: s_waitcnt vmcnt(0)
684 ; CI-NEXT: v_bfe_u32 v1, v0, 16, 15
685 ; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
686 ; CI-NEXT: flat_store_short v[0:1], v0
687 ; CI-NEXT: s_waitcnt vmcnt(0)
688 ; CI-NEXT: flat_store_short v[0:1], v1
689 ; CI-NEXT: s_waitcnt vmcnt(0)
692 ; VI-LABEL: v_extract_fabs_no_fold_v2f16:
694 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
695 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
696 ; VI-NEXT: s_waitcnt lgkmcnt(0)
697 ; VI-NEXT: v_mov_b32_e32 v1, s1
698 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
699 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
700 ; VI-NEXT: flat_load_dword v0, v[0:1]
701 ; VI-NEXT: s_waitcnt vmcnt(0)
702 ; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
703 ; VI-NEXT: v_bfe_u32 v0, v0, 16, 15
704 ; VI-NEXT: flat_store_short v[0:1], v1
705 ; VI-NEXT: s_waitcnt vmcnt(0)
706 ; VI-NEXT: flat_store_short v[0:1], v0
707 ; VI-NEXT: s_waitcnt vmcnt(0)
710 ; GFX9-LABEL: v_extract_fabs_no_fold_v2f16:
712 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
713 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
714 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
715 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
716 ; GFX9-NEXT: s_waitcnt vmcnt(0)
717 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
718 ; GFX9-NEXT: global_store_short v[0:1], v0, off
719 ; GFX9-NEXT: s_waitcnt vmcnt(0)
720 ; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0, off
721 ; GFX9-NEXT: s_waitcnt vmcnt(0)
722 ; GFX9-NEXT: s_endpgm
724 ; GFX11-LABEL: v_extract_fabs_no_fold_v2f16:
726 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
727 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
728 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
729 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
730 ; GFX11-NEXT: s_waitcnt vmcnt(0)
731 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
732 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
733 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
734 ; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
735 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
736 ; GFX11-NEXT: s_nop 0
737 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
738 ; GFX11-NEXT: s_endpgm
739 %tid = call i32 @llvm.amdgcn.workitem.id.x()
740 %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
741 %val = load <2 x half>, ptr addrspace(1) %gep.in
742 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
743 %elt0 = extractelement <2 x half> %fabs, i32 0
744 %elt1 = extractelement <2 x half> %fabs, i32 1
745 store volatile half %elt0, ptr addrspace(1) undef
746 store volatile half %elt1, ptr addrspace(1) undef
; Intrinsics called by the kernels in this file: fabs for half, <2 x half>
; and <4 x half>, plus the workitem id used to index per-lane loads.
750 declare half @llvm.fabs.f16(half) #1
751 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
752 declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
753 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute groups: #0 is referenced by some of the kernel definitions,
; #1 by the intrinsic declarations.
755 attributes #0 = { nounwind }
756 attributes #1 = { nounwind readnone }