1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
3 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
7 ; DAGCombiner will transform:
8 ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
9 ; unless isFabsFree returns true
; Scalar fabs of a half obtained by bitcasting an i16 kernel argument.
; Every RUN configuration expects one s_and_b32 with 0x7fff (sign bit cleared)
; followed by a 16-bit store of the result.
11 define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
12 ; CI-LABEL: s_fabs_free_f16:
14 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2
15 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
16 ; CI-NEXT: s_waitcnt lgkmcnt(0)
17 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff
18 ; CI-NEXT: v_mov_b32_e32 v0, s0
19 ; CI-NEXT: v_mov_b32_e32 v1, s1
20 ; CI-NEXT: v_mov_b32_e32 v2, s2
21 ; CI-NEXT: flat_store_short v[0:1], v2
24 ; VI-LABEL: s_fabs_free_f16:
26 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
27 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
28 ; VI-NEXT: s_waitcnt lgkmcnt(0)
29 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
30 ; VI-NEXT: v_mov_b32_e32 v0, s0
31 ; VI-NEXT: v_mov_b32_e32 v1, s1
32 ; VI-NEXT: v_mov_b32_e32 v2, s2
33 ; VI-NEXT: flat_store_short v[0:1], v2
36 ; GFX9-LABEL: s_fabs_free_f16:
38 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8
39 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
40 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
41 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
42 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
43 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
44 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
47 ; GFX11-LABEL: s_fabs_free_f16:
49 ; GFX11-NEXT: s_clause 0x1
50 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
51 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
52 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
53 ; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff
54 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
55 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
56 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
58 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
59 ; GFX11-NEXT: s_endpgm
60 %bc= bitcast i16 %in to half
61 %fabs = call half @llvm.fabs.f16(half %bc)
62 store half %fabs, ptr addrspace(1) %out
; Scalar fabs applied directly to a half kernel argument.
; The expected code matches the bitcast variant: one s_and_b32 with 0x7fff
; on the loaded argument dword, then a 16-bit store.
66 define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
67 ; CI-LABEL: s_fabs_f16:
69 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2
70 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
71 ; CI-NEXT: s_waitcnt lgkmcnt(0)
72 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff
73 ; CI-NEXT: v_mov_b32_e32 v0, s0
74 ; CI-NEXT: v_mov_b32_e32 v1, s1
75 ; CI-NEXT: v_mov_b32_e32 v2, s2
76 ; CI-NEXT: flat_store_short v[0:1], v2
79 ; VI-LABEL: s_fabs_f16:
81 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
82 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
83 ; VI-NEXT: s_waitcnt lgkmcnt(0)
84 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
85 ; VI-NEXT: v_mov_b32_e32 v0, s0
86 ; VI-NEXT: v_mov_b32_e32 v1, s1
87 ; VI-NEXT: v_mov_b32_e32 v2, s2
88 ; VI-NEXT: flat_store_short v[0:1], v2
91 ; GFX9-LABEL: s_fabs_f16:
93 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8
94 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
95 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
96 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
97 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
98 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
99 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
100 ; GFX9-NEXT: s_endpgm
102 ; GFX11-LABEL: s_fabs_f16:
104 ; GFX11-NEXT: s_clause 0x1
105 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
106 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
107 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
108 ; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff
109 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
110 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
111 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
112 ; GFX11-NEXT: s_nop 0
113 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
114 ; GFX11-NEXT: s_endpgm
115 %fabs = call half @llvm.fabs.f16(half %in)
116 store half %fabs, ptr addrspace(1) %out
; Scalar fabs of a <2 x half> kernel argument.
; Both sign bits are expected to be cleared by a single s_and_b32 with
; 0x7fff7fff, and the packed result is written with a 32-bit store.
120 define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
121 ; CI-LABEL: s_fabs_v2f16:
123 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2
124 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
125 ; CI-NEXT: s_waitcnt lgkmcnt(0)
126 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
127 ; CI-NEXT: v_mov_b32_e32 v0, s0
128 ; CI-NEXT: v_mov_b32_e32 v1, s1
129 ; CI-NEXT: v_mov_b32_e32 v2, s2
130 ; CI-NEXT: flat_store_dword v[0:1], v2
133 ; VI-LABEL: s_fabs_v2f16:
135 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
136 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
137 ; VI-NEXT: s_waitcnt lgkmcnt(0)
138 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
139 ; VI-NEXT: v_mov_b32_e32 v0, s0
140 ; VI-NEXT: v_mov_b32_e32 v1, s1
141 ; VI-NEXT: v_mov_b32_e32 v2, s2
142 ; VI-NEXT: flat_store_dword v[0:1], v2
145 ; GFX9-LABEL: s_fabs_v2f16:
147 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8
148 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
149 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
150 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
151 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
152 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
153 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
154 ; GFX9-NEXT: s_endpgm
156 ; GFX11-LABEL: s_fabs_v2f16:
158 ; GFX11-NEXT: s_clause 0x1
159 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
160 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
161 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
162 ; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff
163 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
164 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
165 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
166 ; GFX11-NEXT: s_nop 0
167 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
168 ; GFX11-NEXT: s_endpgm
169 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
170 store <2 x half> %fabs, ptr addrspace(1) %out
; Scalar fabs of a <4 x half> kernel argument.
; The expected code uses two s_and_b32 instructions with 0x7fff7fff, one per
; 32-bit register holding a pair of halves, followed by a 64-bit store.
174 define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
175 ; CI-LABEL: s_fabs_v4f16:
177 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
178 ; CI-NEXT: s_waitcnt lgkmcnt(0)
179 ; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
180 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
181 ; CI-NEXT: v_mov_b32_e32 v3, s1
182 ; CI-NEXT: v_mov_b32_e32 v0, s2
183 ; CI-NEXT: v_mov_b32_e32 v1, s3
184 ; CI-NEXT: v_mov_b32_e32 v2, s0
185 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
188 ; VI-LABEL: s_fabs_v4f16:
190 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
191 ; VI-NEXT: s_waitcnt lgkmcnt(0)
192 ; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
193 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
194 ; VI-NEXT: v_mov_b32_e32 v3, s1
195 ; VI-NEXT: v_mov_b32_e32 v0, s2
196 ; VI-NEXT: v_mov_b32_e32 v1, s3
197 ; VI-NEXT: v_mov_b32_e32 v2, s0
198 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
201 ; GFX9-LABEL: s_fabs_v4f16:
203 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
204 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
205 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
206 ; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff
207 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
208 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
209 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
210 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
211 ; GFX9-NEXT: s_endpgm
213 ; GFX11-LABEL: s_fabs_v4f16:
215 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
216 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
217 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff
218 ; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff
219 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
220 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
221 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
222 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
223 ; GFX11-NEXT: s_nop 0
224 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
225 ; GFX11-NEXT: s_endpgm
226 %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
227 store <4 x half> %fabs, ptr addrspace(1) %out
; fabs of one half argument feeding an fmul with a second half argument.
; No separate and-mask is expected: the absolute value appears as a |src|
; source modifier. The CI run expects the modifier on v_cvt_f32_f16_e64 with
; the multiply done in f32; the VI, GFX9 and GFX11 runs expect it on
; v_mul_f16_e64.
231 define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) {
232 ; CI-LABEL: fabs_fold_f16:
234 ; CI-NEXT: s_load_dword s0, s[6:7], 0x2
235 ; CI-NEXT: s_waitcnt lgkmcnt(0)
236 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
237 ; CI-NEXT: s_lshr_b32 s0, s0, 16
238 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
239 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
240 ; CI-NEXT: v_mul_f32_e32 v0, v0, v1
241 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v0
242 ; CI-NEXT: s_waitcnt lgkmcnt(0)
243 ; CI-NEXT: v_mov_b32_e32 v0, s0
244 ; CI-NEXT: v_mov_b32_e32 v1, s1
245 ; CI-NEXT: flat_store_short v[0:1], v2
248 ; VI-LABEL: fabs_fold_f16:
250 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
251 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
252 ; VI-NEXT: s_waitcnt lgkmcnt(0)
253 ; VI-NEXT: s_lshr_b32 s3, s2, 16
254 ; VI-NEXT: v_mov_b32_e32 v0, s3
255 ; VI-NEXT: v_mul_f16_e64 v2, |s2|, v0
256 ; VI-NEXT: v_mov_b32_e32 v0, s0
257 ; VI-NEXT: v_mov_b32_e32 v1, s1
258 ; VI-NEXT: flat_store_short v[0:1], v2
261 ; GFX9-LABEL: fabs_fold_f16:
263 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8
264 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
265 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
266 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
267 ; GFX9-NEXT: s_lshr_b32 s3, s2, 16
268 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
269 ; GFX9-NEXT: v_mul_f16_e64 v1, |s2|, v1
270 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
271 ; GFX9-NEXT: s_endpgm
273 ; GFX11-LABEL: fabs_fold_f16:
275 ; GFX11-NEXT: s_clause 0x1
276 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
277 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
278 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
279 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
280 ; GFX11-NEXT: s_lshr_b32 s2, s4, 16
281 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
282 ; GFX11-NEXT: v_mul_f16_e64 v1, |s4|, s2
283 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
284 ; GFX11-NEXT: s_nop 0
285 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
286 ; GFX11-NEXT: s_endpgm
287 %fabs = call half @llvm.fabs.f16(half %in0)
288 %fmul = fmul half %fabs, %in1
289 store half %fmul, ptr addrspace(1) %out
; Vector (per-workitem) fabs of a <2 x half> loaded from memory at an address
; indexed by the workitem id. The expected code is a 32-bit load, a v_and_b32
; with 0x7fff7fff, and a 32-bit store.
; NOTE(review): %gep.out is computed from %in rather than %out, so %out is
; never used and the result is stored back to the address it was loaded from
; (the assertions load only one pointer argument). This looks unintentional;
; confirm, and regenerate the assertions if the IR is corrected.
293 define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
294 ; CI-LABEL: v_fabs_v2f16:
296 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x2
297 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
298 ; CI-NEXT: s_waitcnt lgkmcnt(0)
299 ; CI-NEXT: v_mov_b32_e32 v1, s1
300 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
301 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
302 ; CI-NEXT: flat_load_dword v2, v[0:1]
303 ; CI-NEXT: s_waitcnt vmcnt(0)
304 ; CI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
305 ; CI-NEXT: flat_store_dword v[0:1], v2
308 ; VI-LABEL: v_fabs_v2f16:
310 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8
311 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
312 ; VI-NEXT: s_waitcnt lgkmcnt(0)
313 ; VI-NEXT: v_mov_b32_e32 v1, s1
314 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
315 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
316 ; VI-NEXT: flat_load_dword v2, v[0:1]
317 ; VI-NEXT: s_waitcnt vmcnt(0)
318 ; VI-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
319 ; VI-NEXT: flat_store_dword v[0:1], v2
322 ; GFX9-LABEL: v_fabs_v2f16:
324 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8
325 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
326 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
327 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
328 ; GFX9-NEXT: s_waitcnt vmcnt(0)
329 ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
330 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
331 ; GFX9-NEXT: s_endpgm
333 ; GFX11-LABEL: v_fabs_v2f16:
335 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x8
336 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
337 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
338 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
339 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
340 ; GFX11-NEXT: global_load_b32 v1, v0, s[0:1]
341 ; GFX11-NEXT: s_waitcnt vmcnt(0)
342 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
343 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
344 ; GFX11-NEXT: s_nop 0
345 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
346 ; GFX11-NEXT: s_endpgm
347 %tid = call i32 @llvm.amdgcn.workitem.id.x()
348 %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
349 %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
350 %val = load <2 x half>, ptr addrspace(1) %gep.in, align 2
351 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
352 store <2 x half> %fabs, ptr addrspace(1) %gep.out
; fabs of a <2 x half> obtained by bitcasting an i32 kernel argument.
; Every RUN configuration expects one s_and_b32 with 0x7fff7fff on the loaded
; argument, followed by a 32-bit store.
356 define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
357 ; CI-LABEL: fabs_free_v2f16:
359 ; CI-NEXT: s_load_dword s2, s[6:7], 0x2
360 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
361 ; CI-NEXT: s_waitcnt lgkmcnt(0)
362 ; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
363 ; CI-NEXT: v_mov_b32_e32 v0, s0
364 ; CI-NEXT: v_mov_b32_e32 v1, s1
365 ; CI-NEXT: v_mov_b32_e32 v2, s2
366 ; CI-NEXT: flat_store_dword v[0:1], v2
369 ; VI-LABEL: fabs_free_v2f16:
371 ; VI-NEXT: s_load_dword s2, s[6:7], 0x8
372 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
373 ; VI-NEXT: s_waitcnt lgkmcnt(0)
374 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
375 ; VI-NEXT: v_mov_b32_e32 v0, s0
376 ; VI-NEXT: v_mov_b32_e32 v1, s1
377 ; VI-NEXT: v_mov_b32_e32 v2, s2
378 ; VI-NEXT: flat_store_dword v[0:1], v2
381 ; GFX9-LABEL: fabs_free_v2f16:
383 ; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8
384 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
385 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
386 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
387 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff
388 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
389 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
390 ; GFX9-NEXT: s_endpgm
392 ; GFX11-LABEL: fabs_free_v2f16:
394 ; GFX11-NEXT: s_clause 0x1
395 ; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
396 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
397 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
398 ; GFX11-NEXT: s_and_b32 s2, s4, 0x7fff7fff
399 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
400 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
401 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
402 ; GFX11-NEXT: s_nop 0
403 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
404 ; GFX11-NEXT: s_endpgm
405 %bc = bitcast i32 %in to <2 x half>
406 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
407 store <2 x half> %fabs, ptr addrspace(1) %out
411 ; FIXME: Should do fabs after conversion to avoid converting multiple
412 ; times in this particular case.
; Computes fabs(val) * val for a <2 x half> loaded per workitem.
; In the CI run each element is converted to f32 twice (once plain, once with
; a |src| modifier) and multiplied in f32. In the VI run the |src| modifier is
; expected on v_mul_f16_sdwa / v_mul_f16_e64. In the GFX9 and GFX11 runs the
; fabs is a separate v_and_b32 with 0x7fff7fff feeding v_pk_mul_f16.
413 define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
414 ; CI-LABEL: v_fabs_fold_self_v2f16:
416 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
417 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
418 ; CI-NEXT: s_waitcnt lgkmcnt(0)
419 ; CI-NEXT: v_mov_b32_e32 v1, s3
420 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
421 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
422 ; CI-NEXT: flat_load_dword v0, v[0:1]
423 ; CI-NEXT: s_waitcnt vmcnt(0)
424 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
425 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v1
426 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
427 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v0
428 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
429 ; CI-NEXT: v_mul_f32_e32 v1, v1, v2
430 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v1
431 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3
432 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v0
433 ; CI-NEXT: v_mov_b32_e32 v0, s0
434 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
435 ; CI-NEXT: v_mov_b32_e32 v1, s1
436 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
437 ; CI-NEXT: flat_store_dword v[0:1], v2
440 ; VI-LABEL: v_fabs_fold_self_v2f16:
442 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
443 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
444 ; VI-NEXT: s_waitcnt lgkmcnt(0)
445 ; VI-NEXT: v_mov_b32_e32 v1, s3
446 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
447 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
448 ; VI-NEXT: flat_load_dword v2, v[0:1]
449 ; VI-NEXT: v_mov_b32_e32 v0, s0
450 ; VI-NEXT: v_mov_b32_e32 v1, s1
451 ; VI-NEXT: s_waitcnt vmcnt(0)
452 ; VI-NEXT: v_mul_f16_sdwa v3, |v2|, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
453 ; VI-NEXT: v_mul_f16_e64 v2, |v2|, v2
454 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
455 ; VI-NEXT: flat_store_dword v[0:1], v2
458 ; GFX9-LABEL: v_fabs_fold_self_v2f16:
460 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
461 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
462 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
463 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
464 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
465 ; GFX9-NEXT: s_waitcnt vmcnt(0)
466 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v0
467 ; GFX9-NEXT: v_pk_mul_f16 v0, v2, v0
468 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
469 ; GFX9-NEXT: s_endpgm
471 ; GFX11-LABEL: v_fabs_fold_self_v2f16:
473 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
474 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
475 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
476 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
477 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
478 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
479 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
480 ; GFX11-NEXT: s_waitcnt vmcnt(0)
481 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
482 ; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0
483 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
484 ; GFX11-NEXT: s_nop 0
485 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
486 ; GFX11-NEXT: s_endpgm
487 %tid = call i32 @llvm.amdgcn.workitem.id.x()
488 %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
489 %val = load <2 x half>, ptr addrspace(1) %gep
490 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
491 %fmul = fmul <2 x half> %fabs, %val
492 store <2 x half> %fmul, ptr addrspace(1) %out
; Computes fabs(val) * other for a <2 x half> loaded per workitem, where the
; other operand is an i32 kernel argument bitcast to <2 x half>.
; In the CI run the |src| modifier is expected on the f16-to-f32 conversions
; of the loaded value. In the VI run it is expected on v_mul_f16_sdwa /
; v_mul_f16_e64. In the GFX9 and GFX11 runs the fabs is a separate v_and_b32
; with 0x7fff7fff feeding v_pk_mul_f16.
496 define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %other.val) #0 {
497 ; CI-LABEL: v_fabs_fold_v2f16:
499 ; CI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
500 ; CI-NEXT: s_load_dword s4, s[6:7], 0x4
501 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
502 ; CI-NEXT: s_waitcnt lgkmcnt(0)
503 ; CI-NEXT: v_mov_b32_e32 v1, s3
504 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
505 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
506 ; CI-NEXT: flat_load_dword v0, v[0:1]
507 ; CI-NEXT: s_lshr_b32 s2, s4, 16
508 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s2
509 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s4
510 ; CI-NEXT: s_waitcnt vmcnt(0)
511 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
512 ; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
513 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
514 ; CI-NEXT: v_mul_f32_e32 v1, v2, v1
515 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v1
516 ; CI-NEXT: v_mul_f32_e32 v0, v0, v3
517 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v0
518 ; CI-NEXT: v_mov_b32_e32 v0, s0
519 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
520 ; CI-NEXT: v_mov_b32_e32 v1, s1
521 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
522 ; CI-NEXT: flat_store_dword v[0:1], v2
525 ; VI-LABEL: v_fabs_fold_v2f16:
527 ; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
528 ; VI-NEXT: s_load_dword s4, s[6:7], 0x10
529 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
530 ; VI-NEXT: s_waitcnt lgkmcnt(0)
531 ; VI-NEXT: v_mov_b32_e32 v1, s3
532 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
533 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
534 ; VI-NEXT: flat_load_dword v2, v[0:1]
535 ; VI-NEXT: v_mov_b32_e32 v0, s0
536 ; VI-NEXT: s_lshr_b32 s0, s4, 16
537 ; VI-NEXT: v_mov_b32_e32 v3, s0
538 ; VI-NEXT: v_mov_b32_e32 v1, s1
539 ; VI-NEXT: s_waitcnt vmcnt(0)
540 ; VI-NEXT: v_mul_f16_sdwa v3, |v2|, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
541 ; VI-NEXT: v_mul_f16_e64 v2, |v2|, s4
542 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
543 ; VI-NEXT: flat_store_dword v[0:1], v2
546 ; GFX9-LABEL: v_fabs_fold_v2f16:
548 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
549 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x10
550 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
551 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
552 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
553 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
554 ; GFX9-NEXT: s_waitcnt vmcnt(0)
555 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
556 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4
557 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
558 ; GFX9-NEXT: s_endpgm
560 ; GFX11-LABEL: v_fabs_fold_v2f16:
562 ; GFX11-NEXT: s_clause 0x1
563 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
564 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x10
565 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
566 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
567 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
568 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
569 ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
570 ; GFX11-NEXT: s_waitcnt vmcnt(0)
571 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
572 ; GFX11-NEXT: v_pk_mul_f16 v0, v0, s0
573 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
574 ; GFX11-NEXT: s_nop 0
575 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
576 ; GFX11-NEXT: s_endpgm
577 %tid = call i32 @llvm.amdgcn.workitem.id.x()
578 %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
579 %val = load <2 x half>, ptr addrspace(1) %gep
580 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
581 %other.val.cvt = bitcast i32 %other.val to <2 x half>
582 %fmul = fmul <2 x half> %fabs, %other.val.cvt
583 store <2 x half> %fmul, ptr addrspace(1) %out
; Extracts both elements of fabs(<2 x half>) and feeds each into a separate
; arithmetic op (fmul by 4.0 and fadd of 2.0). No and-mask is expected in any
; RUN configuration: the absolute value appears as a |src| source modifier,
; on the f16-to-f32 conversions in the CI run and on the f16 mul/add
; instructions in the VI, GFX9 and GFX11 runs.
; The volatile stores only keep the two results alive; their address is a
; don't-care value, written as poison rather than the deprecated undef.
587 define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
588 ; CI-LABEL: v_extract_fabs_fold_v2f16:
590 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
591 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
592 ; CI-NEXT: s_waitcnt lgkmcnt(0)
593 ; CI-NEXT: v_mov_b32_e32 v1, s1
594 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
595 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
596 ; CI-NEXT: flat_load_dword v0, v[0:1]
597 ; CI-NEXT: s_waitcnt vmcnt(0)
598 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
599 ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
600 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
601 ; CI-NEXT: v_mul_f32_e32 v0, 4.0, v0
602 ; CI-NEXT: v_add_f32_e32 v1, 2.0, v1
603 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
604 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
605 ; CI-NEXT: flat_store_short v[0:1], v0
606 ; CI-NEXT: s_waitcnt vmcnt(0)
607 ; CI-NEXT: flat_store_short v[0:1], v1
608 ; CI-NEXT: s_waitcnt vmcnt(0)
611 ; VI-LABEL: v_extract_fabs_fold_v2f16:
613 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
614 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
615 ; VI-NEXT: s_waitcnt lgkmcnt(0)
616 ; VI-NEXT: v_mov_b32_e32 v1, s1
617 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
618 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
619 ; VI-NEXT: flat_load_dword v0, v[0:1]
620 ; VI-NEXT: v_mov_b32_e32 v1, 0x4000
621 ; VI-NEXT: s_waitcnt vmcnt(0)
622 ; VI-NEXT: v_mul_f16_e64 v2, |v0|, 4.0
623 ; VI-NEXT: v_add_f16_sdwa v0, |v0|, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
624 ; VI-NEXT: flat_store_short v[0:1], v2
625 ; VI-NEXT: s_waitcnt vmcnt(0)
626 ; VI-NEXT: flat_store_short v[0:1], v0
627 ; VI-NEXT: s_waitcnt vmcnt(0)
630 ; GFX9-LABEL: v_extract_fabs_fold_v2f16:
632 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
633 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
634 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4000
635 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
636 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
637 ; GFX9-NEXT: s_waitcnt vmcnt(0)
638 ; GFX9-NEXT: v_mul_f16_e64 v2, |v0|, 4.0
639 ; GFX9-NEXT: v_add_f16_sdwa v0, |v0|, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
640 ; GFX9-NEXT: global_store_short v[0:1], v2, off
641 ; GFX9-NEXT: s_waitcnt vmcnt(0)
642 ; GFX9-NEXT: global_store_short v[0:1], v0, off
643 ; GFX9-NEXT: s_waitcnt vmcnt(0)
644 ; GFX9-NEXT: s_endpgm
646 ; GFX11-LABEL: v_extract_fabs_fold_v2f16:
648 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
649 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
650 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
651 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
652 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
653 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
654 ; GFX11-NEXT: s_waitcnt vmcnt(0)
655 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
656 ; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, 4.0
657 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
658 ; GFX11-NEXT: v_add_f16_e64 v1, |v1|, 2.0
659 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
660 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
661 ; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc
662 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
663 ; GFX11-NEXT: s_nop 0
664 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
665 ; GFX11-NEXT: s_endpgm
666 %tid = call i32 @llvm.amdgcn.workitem.id.x()
667 %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
668 %val = load <2 x half>, ptr addrspace(1) %gep.in
669 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
670 %elt0 = extractelement <2 x half> %fabs, i32 0
671 %elt1 = extractelement <2 x half> %fabs, i32 1
673 %fmul0 = fmul half %elt0, 4.0
674 %fadd1 = fadd half %elt1, 2.0
675 store volatile half %fmul0, ptr addrspace(1) poison
676 store volatile half %fadd1, ptr addrspace(1) poison
; Extracts both elements of fabs(<2 x half>) and stores them directly, so
; there is no arithmetic user that could absorb a |src| modifier. The
; absolute value is therefore expected as explicit bit operations: v_and_b32
; with 0x7fff plus v_bfe_u32 (16, 15) in the CI run, v_and_b32 with
; 0x7fff7fff plus v_bfe_u32 in the VI run, and a single v_and_b32 with
; 0x7fff7fff followed by a low store and a d16_hi store in the GFX9 and
; GFX11 runs.
; The volatile stores only keep the two results alive; their address is a
; don't-care value, written as poison rather than the deprecated undef.
680 define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 {
681 ; CI-LABEL: v_extract_fabs_no_fold_v2f16:
683 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
684 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
685 ; CI-NEXT: s_waitcnt lgkmcnt(0)
686 ; CI-NEXT: v_mov_b32_e32 v1, s1
687 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
688 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
689 ; CI-NEXT: flat_load_dword v0, v[0:1]
690 ; CI-NEXT: s_waitcnt vmcnt(0)
691 ; CI-NEXT: v_bfe_u32 v1, v0, 16, 15
692 ; CI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
693 ; CI-NEXT: flat_store_short v[0:1], v0
694 ; CI-NEXT: s_waitcnt vmcnt(0)
695 ; CI-NEXT: flat_store_short v[0:1], v1
696 ; CI-NEXT: s_waitcnt vmcnt(0)
699 ; VI-LABEL: v_extract_fabs_no_fold_v2f16:
701 ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
702 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
703 ; VI-NEXT: s_waitcnt lgkmcnt(0)
704 ; VI-NEXT: v_mov_b32_e32 v1, s1
705 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
706 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
707 ; VI-NEXT: flat_load_dword v0, v[0:1]
708 ; VI-NEXT: s_waitcnt vmcnt(0)
709 ; VI-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
710 ; VI-NEXT: v_bfe_u32 v0, v0, 16, 15
711 ; VI-NEXT: flat_store_short v[0:1], v1
712 ; VI-NEXT: s_waitcnt vmcnt(0)
713 ; VI-NEXT: flat_store_short v[0:1], v0
714 ; VI-NEXT: s_waitcnt vmcnt(0)
717 ; GFX9-LABEL: v_extract_fabs_no_fold_v2f16:
719 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
720 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
721 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
722 ; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
723 ; GFX9-NEXT: s_waitcnt vmcnt(0)
724 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
725 ; GFX9-NEXT: global_store_short v[0:1], v0, off
726 ; GFX9-NEXT: s_waitcnt vmcnt(0)
727 ; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0, off
728 ; GFX9-NEXT: s_waitcnt vmcnt(0)
729 ; GFX9-NEXT: s_endpgm
731 ; GFX11-LABEL: v_extract_fabs_no_fold_v2f16:
733 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
734 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
735 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
736 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
737 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
738 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
739 ; GFX11-NEXT: s_waitcnt vmcnt(0)
740 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
741 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
742 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
743 ; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
744 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
745 ; GFX11-NEXT: s_nop 0
746 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
747 ; GFX11-NEXT: s_endpgm
748 %tid = call i32 @llvm.amdgcn.workitem.id.x()
749 %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
750 %val = load <2 x half>, ptr addrspace(1) %gep.in
751 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
752 %elt0 = extractelement <2 x half> %fabs, i32 0
753 %elt1 = extractelement <2 x half> %fabs, i32 1
754 store volatile half %elt0, ptr addrspace(1) poison
755 store volatile half %elt1, ptr addrspace(1) poison
; Intrinsics called by the kernels in this file: fabs on half, <2 x half> and
; <4 x half>, plus the workitem-id query used to index per-lane memory.
759 declare half @llvm.fabs.f16(half) #1
760 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
761 declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
762 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 is referenced by some of the kernel definitions; #1 is
; referenced by the intrinsic declarations.
764 attributes #0 = { nounwind }
765 attributes #1 = { nounwind readnone }