1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
3 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
7 ; DAGCombiner will transform:
8 ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
9 ; unless isFabsFree returns true
; fabs of an f16 value obtained by bitcasting an i16 kernel argument.
; All four check prefixes expect the sign bit to be cleared with a scalar
; s_and_b32 against 0x7fff, followed by a 16-bit store of the result.
11 define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
12 ; CI-LABEL: s_fabs_free_f16:
14 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2
15 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
16 ; CI-NEXT: s_waitcnt lgkmcnt(0)
17 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff
18 ; CI-NEXT: v_mov_b32_e32 v0, s0
19 ; CI-NEXT: v_mov_b32_e32 v1, s1
20 ; CI-NEXT: v_mov_b32_e32 v2, s2
21 ; CI-NEXT: flat_store_short v[0:1], v2
24 ; VI-LABEL: s_fabs_free_f16:
26 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
27 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
28 ; VI-NEXT: s_waitcnt lgkmcnt(0)
29 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
30 ; VI-NEXT: v_mov_b32_e32 v0, s0
31 ; VI-NEXT: v_mov_b32_e32 v1, s1
32 ; VI-NEXT: v_mov_b32_e32 v2, s2
33 ; VI-NEXT: flat_store_short v[0:1], v2
36 ; GFX9-LABEL: s_fabs_free_f16:
38 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
39 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
40 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
41 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
42 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
43 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
44 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
47 ; GFX11-LABEL: s_fabs_free_f16:
49 ; GFX11-NEXT: s_clause 0x1
50 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
51 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
52 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
53 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
54 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
55 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
56 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
57 ; GFX11-NEXT: s_endpgm
58 %bc= bitcast i16 %in to half
59 %fabs = call half @llvm.fabs.f16(half %bc)
60 store half %fabs, ptr addrspace(1) %out
; fabs of a half kernel argument passed directly (no bitcast in the IR).
; The expected code for every prefix is a scalar s_and_b32 with 0x7fff on the
; loaded argument dword, followed by a 16-bit store.
64 define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
65 ; CI-LABEL: s_fabs_f16:
67 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2
68 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
69 ; CI-NEXT: s_waitcnt lgkmcnt(0)
70 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff
71 ; CI-NEXT: v_mov_b32_e32 v0, s0
72 ; CI-NEXT: v_mov_b32_e32 v1, s1
73 ; CI-NEXT: v_mov_b32_e32 v2, s2
74 ; CI-NEXT: flat_store_short v[0:1], v2
77 ; VI-LABEL: s_fabs_f16:
79 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
80 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
81 ; VI-NEXT: s_waitcnt lgkmcnt(0)
82 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
83 ; VI-NEXT: v_mov_b32_e32 v0, s0
84 ; VI-NEXT: v_mov_b32_e32 v1, s1
85 ; VI-NEXT: v_mov_b32_e32 v2, s2
86 ; VI-NEXT: flat_store_short v[0:1], v2
89 ; GFX9-LABEL: s_fabs_f16:
91 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
92 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
93 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
94 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
95 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
96 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
97 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
100 ; GFX11-LABEL: s_fabs_f16:
102 ; GFX11-NEXT: s_clause 0x1
103 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
104 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
105 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
106 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
107 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
108 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
109 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
110 ; GFX11-NEXT: s_endpgm
111 %fabs = call half @llvm.fabs.f16(half %in)
112 store half %fabs, ptr addrspace(1) %out
; fabs of a <2 x half> kernel argument.
; Both halves live in one 32-bit scalar register, so every prefix expects a
; single s_and_b32 with 0x7fff7fff (sign bit of each half cleared) and a
; 32-bit store.
116 define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
117 ; CI-LABEL: s_fabs_v2f16:
119 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2
120 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
121 ; CI-NEXT: s_waitcnt lgkmcnt(0)
122 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
123 ; CI-NEXT: v_mov_b32_e32 v0, s0
124 ; CI-NEXT: v_mov_b32_e32 v1, s1
125 ; CI-NEXT: v_mov_b32_e32 v2, s2
126 ; CI-NEXT: flat_store_dword v[0:1], v2
129 ; VI-LABEL: s_fabs_v2f16:
131 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
132 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
133 ; VI-NEXT: s_waitcnt lgkmcnt(0)
134 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
135 ; VI-NEXT: v_mov_b32_e32 v0, s0
136 ; VI-NEXT: v_mov_b32_e32 v1, s1
137 ; VI-NEXT: v_mov_b32_e32 v2, s2
138 ; VI-NEXT: flat_store_dword v[0:1], v2
141 ; GFX9-LABEL: s_fabs_v2f16:
143 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
144 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
145 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
146 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
147 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
148 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
149 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
150 ; GFX9-NEXT: s_endpgm
152 ; GFX11-LABEL: s_fabs_v2f16:
154 ; GFX11-NEXT: s_clause 0x1
155 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
156 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
157 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
158 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
159 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
160 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
161 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
162 ; GFX11-NEXT: s_endpgm
163 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
164 store <2 x half> %fabs, ptr addrspace(1) %out
; fabs of a <4 x half> kernel argument.
; The vector occupies two 32-bit scalar registers (s2 and s3 in the checks);
; each is masked with 0x7fff7fff by its own s_and_b32 and the pair is written
; with one 64-bit store.
168 define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
169 ; CI-LABEL: s_fabs_v4f16:
171 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
172 ; CI-NEXT: s_waitcnt lgkmcnt(0)
173 ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
174 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
175 ; CI-NEXT: v_mov_b32_e32 v3, s1
176 ; CI-NEXT: v_mov_b32_e32 v0, s2
177 ; CI-NEXT: v_mov_b32_e32 v1, s3
178 ; CI-NEXT: v_mov_b32_e32 v2, s0
179 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
182 ; VI-LABEL: s_fabs_v4f16:
184 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
185 ; VI-NEXT: s_waitcnt lgkmcnt(0)
186 ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
187 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
188 ; VI-NEXT: v_mov_b32_e32 v3, s1
189 ; VI-NEXT: v_mov_b32_e32 v0, s2
190 ; VI-NEXT: v_mov_b32_e32 v1, s3
191 ; VI-NEXT: v_mov_b32_e32 v2, s0
192 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
195 ; GFX9-LABEL: s_fabs_v4f16:
197 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
198 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
199 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
200 ; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff
201 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
202 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
203 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
204 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
205 ; GFX9-NEXT: s_endpgm
207 ; GFX11-LABEL: s_fabs_v4f16:
209 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
210 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
211 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
212 ; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff
213 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
214 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
215 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
216 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
217 ; GFX11-NEXT: s_endpgm
218 %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
219 store <4 x half> %fabs, ptr addrspace(1) %out
; fabs(%in0) * %in1 on scalar half arguments: the fabs is expected to fold
; into the consuming instruction as a |src| modifier rather than appear as a
; separate mask.
; - The CI checks apply |s0| on v_cvt_f32_f16_e64 and multiply in f32 before
;   converting the product back to f16.
; - The VI, GFX9 and GFX11 checks apply |s2| directly on v_mul_f16_e64.
; In all checks the second operand is taken from the upper 16 bits of the
; same loaded dword (s_lshr_b32 by 16).
223 define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) {
224 ; CI-LABEL: fabs_fold_f16:
226 ; CI-NEXT: s_load_dword s0, s[8:9], 0x2
227 ; CI-NEXT: s_waitcnt lgkmcnt(0)
228 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
229 ; CI-NEXT: s_lshr_b32 s0, s0, 16
230 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
231 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
232 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1
233 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0
234 ; CI-NEXT: s_waitcnt lgkmcnt(0)
235 ; CI-NEXT: v_mov_b32_e32 v0, s0
236 ; CI-NEXT: v_mov_b32_e32 v1, s1
237 ; CI-NEXT: flat_store_short v[0:1], v2
240 ; VI-LABEL: fabs_fold_f16:
242 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
243 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
244 ; VI-NEXT: s_waitcnt lgkmcnt(0)
245 ; VI-NEXT: s_lshr_b32 s3, s2, 16
246 ; VI-NEXT: v_mov_b32_e32 v0, s3
247 ; VI-NEXT: v_mul_f16_e64 v2, |s2|, v0
248 ; VI-NEXT: v_mov_b32_e32 v0, s0
249 ; VI-NEXT: v_mov_b32_e32 v1, s1
250 ; VI-NEXT: flat_store_short v[0:1], v2
253 ; GFX9-LABEL: fabs_fold_f16:
255 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
256 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
257 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
258 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
259 ; GFX9-NEXT: s_lshr_b32 s3, s2, 16
260 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
261 ; GFX9-NEXT: v_mul_f16_e64 v1, |s2|, v1
262 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
263 ; GFX9-NEXT: s_endpgm
265 ; GFX11-LABEL: fabs_fold_f16:
267 ; GFX11-NEXT: s_clause 0x1
268 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
269 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
270 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
271 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
272 ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
273 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
274 ; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3
275 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
276 ; GFX11-NEXT: s_endpgm
277 %fabs = call half @llvm.fabs.f16(half %in0)
278 %fmul = fmul half %fabs, %in1
279 store half %fmul, ptr addrspace(1) %out
; Per-work-item fabs of a <2 x half> loaded from memory (vector-register
; variant of the scalar tests). Every prefix expects a v_and_b32 with
; 0x7fff7fff on the loaded dword, then a 32-bit store.
283 define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
284 ; CI-LABEL: v_fabs_v2f16:
286 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
287 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
288 ; CI-NEXT: s_waitcnt lgkmcnt(0)
289 ; CI-NEXT: v_mov_b32_e32 v1, s1
290 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
291 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
292 ; CI-NEXT: flat_load_dword v2, v[0:1]
293 ; CI-NEXT: s_waitcnt vmcnt(0)
294 ; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
295 ; CI-NEXT: flat_store_dword v[0:1], v2
298 ; VI-LABEL: v_fabs_v2f16:
300 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
301 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
302 ; VI-NEXT: s_waitcnt lgkmcnt(0)
303 ; VI-NEXT: v_mov_b32_e32 v1, s1
304 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
305 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
306 ; VI-NEXT: flat_load_dword v2, v[0:1]
307 ; VI-NEXT: s_waitcnt vmcnt(0)
308 ; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
309 ; VI-NEXT: flat_store_dword v[0:1], v2
312 ; GFX9-LABEL: v_fabs_v2f16:
314 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
315 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
316 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
317 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
318 ; GFX9-NEXT: s_waitcnt vmcnt(0)
319 ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
320 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
321 ; GFX9-NEXT: s_endpgm
323 ; GFX11-LABEL: v_fabs_v2f16:
325 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
326 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
327 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
328 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
329 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
330 ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
331 ; GFX11-NEXT: s_waitcnt vmcnt(0)
332 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
333 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
334 ; GFX11-NEXT: s_endpgm
335 %tid = call i32 @llvm.amdgcn.workitem.id.x()
336 %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
; NOTE(review): %gep.out is derived from %in, not %out, so the store below
; writes back to the address that was loaded and %out is never used (the
; checks load only one pointer). Possibly unintended; changing it would
; require regenerating the assertions.
337 %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
338 %val = load <2 x half>, ptr addrspace(1) %gep.in, align 2
339 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
340 store <2 x half> %fabs, ptr addrspace(1) %gep.out
; fabs of a <2 x half> obtained by bitcasting an i32 kernel argument (the
; packed counterpart of the scalar i16 bitcast case).
; Every prefix expects a single s_and_b32 with 0x7fff7fff and a 32-bit store.
344 define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
345 ; CI-LABEL: fabs_free_v2f16:
347 ; CI-NEXT: s_load_dword s2, s[8:9], 0x2
348 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
349 ; CI-NEXT: s_waitcnt lgkmcnt(0)
350 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
351 ; CI-NEXT: v_mov_b32_e32 v0, s0
352 ; CI-NEXT: v_mov_b32_e32 v1, s1
353 ; CI-NEXT: v_mov_b32_e32 v2, s2
354 ; CI-NEXT: flat_store_dword v[0:1], v2
357 ; VI-LABEL: fabs_free_v2f16:
359 ; VI-NEXT: s_load_dword s2, s[8:9], 0x8
360 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
361 ; VI-NEXT: s_waitcnt lgkmcnt(0)
362 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
363 ; VI-NEXT: v_mov_b32_e32 v0, s0
364 ; VI-NEXT: v_mov_b32_e32 v1, s1
365 ; VI-NEXT: v_mov_b32_e32 v2, s2
366 ; VI-NEXT: flat_store_dword v[0:1], v2
369 ; GFX9-LABEL: fabs_free_v2f16:
371 ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
372 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
373 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
374 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
375 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
376 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
377 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
378 ; GFX9-NEXT: s_endpgm
380 ; GFX11-LABEL: fabs_free_v2f16:
382 ; GFX11-NEXT: s_clause 0x1
383 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
384 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
385 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
386 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
387 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
388 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
389 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
390 ; GFX11-NEXT: s_endpgm
391 %bc = bitcast i32 %in to <2 x half>
392 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
393 store <2 x half> %fabs, ptr addrspace(1) %out
397 ; FIXME: Should do fabs after conversion to avoid converting multiple
398 ; times in this particular case.
; fabs(%val) * %val on a loaded <2 x half>: the same value is used both with
; and without fabs.
; - The CI checks convert each half to f32 twice, once plain and once with
;   the |src| modifier, then multiply in f32 and repack the two results.
; - The VI checks fold the fabs as |v2| on v_mul_f16_sdwa (high half) and
;   v_mul_f16_e64 (low half), then OR the halves together.
; - The GFX9 and GFX11 checks keep an explicit v_and_b32 with 0x7fff7fff and
;   feed it to v_pk_mul_f16.
399 define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
400 ; CI-LABEL: v_fabs_fold_self_v2f16:
402 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
403 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
404 ; CI-NEXT: s_waitcnt lgkmcnt(0)
405 ; CI-NEXT: v_mov_b32_e32 v1, s3
406 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
407 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
408 ; CI-NEXT: flat_load_dword v0, v[0:1]
409 ; CI-NEXT: s_waitcnt vmcnt(0)
410 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
411 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v1
412 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
413 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v0
414 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
415 ; CI-NEXT: v_mul_f32_e32 v1, v1, v2
416 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v1
417 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3
418 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v0
419 ; CI-NEXT: v_mov_b32_e32 v0, s0
420 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
421 ; CI-NEXT: v_mov_b32_e32 v1, s1
422 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
423 ; CI-NEXT: flat_store_dword v[0:1], v2
426 ; VI-LABEL: v_fabs_fold_self_v2f16:
428 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
429 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
430 ; VI-NEXT: s_waitcnt lgkmcnt(0)
431 ; VI-NEXT: v_mov_b32_e32 v1, s3
432 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
433 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
434 ; VI-NEXT: flat_load_dword v2, v[0:1]
435 ; VI-NEXT: v_mov_b32_e32 v0, s0
436 ; VI-NEXT: v_mov_b32_e32 v1, s1
437 ; VI-NEXT: s_waitcnt vmcnt(0)
438 ; VI-NEXT: v_mul_f16_sdwa v3, |v2|, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
439 ; VI-NEXT: v_mul_f16_e64 v2, |v2|, v2
440 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
441 ; VI-NEXT: flat_store_dword v[0:1], v2
444 ; GFX9-LABEL: v_fabs_fold_self_v2f16:
446 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
447 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
448 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
449 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
450 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
451 ; GFX9-NEXT: s_waitcnt vmcnt(0)
452 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v0
453 ; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0
454 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
455 ; GFX9-NEXT: s_endpgm
457 ; GFX11-LABEL: v_fabs_fold_self_v2f16:
459 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
460 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
461 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
462 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
463 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
464 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
465 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
466 ; GFX11-NEXT: s_waitcnt vmcnt(0)
467 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
468 ; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0
469 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
470 ; GFX11-NEXT: s_endpgm
471 %tid = call i32 @llvm.amdgcn.workitem.id.x()
472 %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
473 %val = load <2 x half>, ptr addrspace(1) %gep
474 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
475 %fmul = fmul <2 x half> %fabs, %val
476 store <2 x half> %fmul, ptr addrspace(1) %out
; fabs(%val) * %other.val, where %val is a loaded <2 x half> and %other.val
; is an i32 kernel argument bitcast to <2 x half>.
; - The CI checks apply the |src| modifier on the f16-to-f32 conversions of
;   the loaded halves and multiply in f32.
; - The VI checks fold the fabs as |v2| on v_mul_f16_sdwa and v_mul_f16_e64.
; - The GFX9 and GFX11 checks keep an explicit v_and_b32 with 0x7fff7fff and
;   multiply by the scalar operand with v_pk_mul_f16.
480 define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %other.val) #0 {
481 ; CI-LABEL: v_fabs_fold_v2f16:
483 ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
484 ; CI-NEXT: s_load_dword s4, s[8:9], 0x4
485 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
486 ; CI-NEXT: s_waitcnt lgkmcnt(0)
487 ; CI-NEXT: v_mov_b32_e32 v1, s3
488 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
489 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
490 ; CI-NEXT: flat_load_dword v0, v[0:1]
491 ; CI-NEXT: s_lshr_b32 s2, s4, 16
492 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s2
493 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s4
494 ; CI-NEXT: s_waitcnt vmcnt(0)
495 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
496 ; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
497 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
498 ; CI-NEXT: v_mul_f32_e32 v1, v2, v1
499 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v1
500 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3
501 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v0
502 ; CI-NEXT: v_mov_b32_e32 v0, s0
503 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
504 ; CI-NEXT: v_mov_b32_e32 v1, s1
505 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
506 ; CI-NEXT: flat_store_dword v[0:1], v2
509 ; VI-LABEL: v_fabs_fold_v2f16:
511 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
512 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
513 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
514 ; VI-NEXT: s_waitcnt lgkmcnt(0)
515 ; VI-NEXT: v_mov_b32_e32 v1, s3
516 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
517 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
518 ; VI-NEXT: flat_load_dword v2, v[0:1]
519 ; VI-NEXT: v_mov_b32_e32 v0, s0
520 ; VI-NEXT: s_lshr_b32 s0, s4, 16
521 ; VI-NEXT: v_mov_b32_e32 v3, s0
522 ; VI-NEXT: v_mov_b32_e32 v1, s1
523 ; VI-NEXT: s_waitcnt vmcnt(0)
524 ; VI-NEXT: v_mul_f16_sdwa v3, |v2|, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
525 ; VI-NEXT: v_mul_f16_e64 v2, |v2|, s4
526 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
527 ; VI-NEXT: flat_store_dword v[0:1], v2
530 ; GFX9-LABEL: v_fabs_fold_v2f16:
532 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
533 ; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10
534 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
535 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
536 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
537 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
538 ; GFX9-NEXT: s_waitcnt vmcnt(0)
539 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
540 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4
541 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
542 ; GFX9-NEXT: s_endpgm
544 ; GFX11-LABEL: v_fabs_fold_v2f16:
546 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
547 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
548 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
549 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
550 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
551 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
552 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
553 ; GFX11-NEXT: s_waitcnt vmcnt(0)
554 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
555 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, s4
556 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
557 ; GFX11-NEXT: s_endpgm
558 %tid = call i32 @llvm.amdgcn.workitem.id.x()
559 %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
560 %val = load <2 x half>, ptr addrspace(1) %gep
561 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
562 %other.val.cvt = bitcast i32 %other.val to <2 x half>
563 %fmul = fmul <2 x half> %fabs, %other.val.cvt
564 store <2 x half> %fmul, ptr addrspace(1) %out
; Both elements of fabs(<2 x half>) are extracted and each feeds a scalar f16
; operation (element 0 * 4.0, element 1 + 2.0); the results are written with
; volatile stores to an undef pointer. The fabs is expected to fold into the
; users as a |src| modifier on every target:
; - CI applies it on the f16-to-f32 conversions.
; - VI and GFX9 apply it on v_mul_f16_e64 and v_add_f16_sdwa.
; - GFX11 shifts out the high half and applies it on v_mul_f16_e64 and
;   v_add_f16_e64.
568 define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
569 ; CI-LABEL: v_extract_fabs_fold_v2f16:
571 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
572 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
573 ; CI-NEXT: s_waitcnt lgkmcnt(0)
574 ; CI-NEXT: v_mov_b32_e32 v1, s1
575 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
576 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
577 ; CI-NEXT: flat_load_dword v0, v[0:1]
578 ; CI-NEXT: s_waitcnt vmcnt(0)
579 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
580 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
581 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
582 ; CI-NEXT: v_mul_f32_e32 v0, 4.0, v0
583 ; CI-NEXT: v_add_f32_e32 v1, 2.0, v1
584 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
585 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
586 ; CI-NEXT: flat_store_short v[0:1], v0
587 ; CI-NEXT: s_waitcnt vmcnt(0)
588 ; CI-NEXT: flat_store_short v[0:1], v1
589 ; CI-NEXT: s_waitcnt vmcnt(0)
592 ; VI-LABEL: v_extract_fabs_fold_v2f16:
594 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
595 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
596 ; VI-NEXT: s_waitcnt lgkmcnt(0)
597 ; VI-NEXT: v_mov_b32_e32 v1, s1
598 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
599 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
600 ; VI-NEXT: flat_load_dword v0, v[0:1]
601 ; VI-NEXT: v_mov_b32_e32 v1, 0x4000
602 ; VI-NEXT: s_waitcnt vmcnt(0)
603 ; VI-NEXT: v_mul_f16_e64 v2, |v0|, 4.0
604 ; VI-NEXT: v_add_f16_sdwa v0, |v0|, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
605 ; VI-NEXT: flat_store_short v[0:1], v2
606 ; VI-NEXT: s_waitcnt vmcnt(0)
607 ; VI-NEXT: flat_store_short v[0:1], v0
608 ; VI-NEXT: s_waitcnt vmcnt(0)
611 ; GFX9-LABEL: v_extract_fabs_fold_v2f16:
613 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
614 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
615 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000
616 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
617 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
618 ; GFX9-NEXT: s_waitcnt vmcnt(0)
619 ; GFX9-NEXT: v_mul_f16_e64 v2, |v0|, 4.0
620 ; GFX9-NEXT: v_add_f16_sdwa v0, |v0|, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
621 ; GFX9-NEXT: global_store_short v[0:1], v2, off
622 ; GFX9-NEXT: s_waitcnt vmcnt(0)
623 ; GFX9-NEXT: global_store_short v[0:1], v0, off
624 ; GFX9-NEXT: s_waitcnt vmcnt(0)
625 ; GFX9-NEXT: s_endpgm
627 ; GFX11-LABEL: v_extract_fabs_fold_v2f16:
629 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
630 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
631 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
632 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
633 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
634 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
635 ; GFX11-NEXT: s_waitcnt vmcnt(0)
636 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
637 ; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, 4.0
638 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
639 ; GFX11-NEXT: v_add_f16_e64 v1, |v1|, 2.0
640 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
641 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
642 ; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc
643 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
644 ; GFX11-NEXT: s_endpgm
645 %tid = call i32 @llvm.amdgcn.workitem.id.x()
646 %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
647 %val = load <2 x half>, ptr addrspace(1) %gep.in
648 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
649 %elt0 = extractelement <2 x half> %fabs, i32 0
650 %elt1 = extractelement <2 x half> %fabs, i32 1
652 %fmul0 = fmul half %elt0, 4.0
653 %fadd1 = fadd half %elt1, 2.0
654 store volatile half %fmul0, ptr addrspace(1) undef
655 store volatile half %fadd1, ptr addrspace(1) undef
; Both elements of fabs(<2 x half>) are extracted and stored directly with
; volatile stores to an undef pointer. There is no arithmetic user to carry a
; |src| modifier, so the checks expect explicit bit operations:
; - CI uses v_bfe_u32 (offset 16, width 15) for the high half and v_and_b32
;   with 0x7fff for the low half.
; - VI uses v_and_b32 with 0x7fff7fff for the low-half store and v_bfe_u32
;   for the high half.
; - GFX9 and GFX11 use one v_and_b32 with 0x7fff7fff and store the low and
;   high halves of that register separately (plain and d16_hi stores).
659 define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 {
660 ; CI-LABEL: v_extract_fabs_no_fold_v2f16:
662 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
663 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
664 ; CI-NEXT: s_waitcnt lgkmcnt(0)
665 ; CI-NEXT: v_mov_b32_e32 v1, s1
666 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
667 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
668 ; CI-NEXT: flat_load_dword v0, v[0:1]
669 ; CI-NEXT: s_waitcnt vmcnt(0)
670 ; CI-NEXT: v_bfe_u32 v1, v0, 16, 15
671 ; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
672 ; CI-NEXT: flat_store_short v[0:1], v0
673 ; CI-NEXT: s_waitcnt vmcnt(0)
674 ; CI-NEXT: flat_store_short v[0:1], v1
675 ; CI-NEXT: s_waitcnt vmcnt(0)
678 ; VI-LABEL: v_extract_fabs_no_fold_v2f16:
680 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
681 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
682 ; VI-NEXT: s_waitcnt lgkmcnt(0)
683 ; VI-NEXT: v_mov_b32_e32 v1, s1
684 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
685 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
686 ; VI-NEXT: flat_load_dword v0, v[0:1]
687 ; VI-NEXT: s_waitcnt vmcnt(0)
688 ; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
689 ; VI-NEXT: v_bfe_u32 v0, v0, 16, 15
690 ; VI-NEXT: flat_store_short v[0:1], v1
691 ; VI-NEXT: s_waitcnt vmcnt(0)
692 ; VI-NEXT: flat_store_short v[0:1], v0
693 ; VI-NEXT: s_waitcnt vmcnt(0)
696 ; GFX9-LABEL: v_extract_fabs_no_fold_v2f16:
698 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
699 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
700 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
701 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
702 ; GFX9-NEXT: s_waitcnt vmcnt(0)
703 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
704 ; GFX9-NEXT: global_store_short v[0:1], v0, off
705 ; GFX9-NEXT: s_waitcnt vmcnt(0)
706 ; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0, off
707 ; GFX9-NEXT: s_waitcnt vmcnt(0)
708 ; GFX9-NEXT: s_endpgm
710 ; GFX11-LABEL: v_extract_fabs_no_fold_v2f16:
712 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
713 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
714 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
715 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
716 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
717 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
718 ; GFX11-NEXT: s_waitcnt vmcnt(0)
719 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
720 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
721 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
722 ; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
723 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
724 ; GFX11-NEXT: s_endpgm
725 %tid = call i32 @llvm.amdgcn.workitem.id.x()
726 %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
727 %val = load <2 x half>, ptr addrspace(1) %gep.in
728 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
729 %elt0 = extractelement <2 x half> %fabs, i32 0
730 %elt1 = extractelement <2 x half> %fabs, i32 1
731 store volatile half %elt0, ptr addrspace(1) undef
732 store volatile half %elt1, ptr addrspace(1) undef
; Intrinsics called by the kernels in this file: fabs for half, <2 x half>
; and <4 x half>, plus the work-item id used to index per-lane loads.
736 declare half @llvm.fabs.f16(half) #1
737 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
738 declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
739 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute groups: #0 is attached to some of the kernel definitions, #1 to
; the intrinsic declarations.
741 attributes #0 = { nounwind }
742 attributes #1 = { nounwind readnone }