1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefixes=SI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope --check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11 %s
7 declare half @llvm.copysign.f16(half, half) #0
8 declare float @llvm.copysign.f32(float, float) #0
9 declare double @llvm.copysign.f64(double, double) #0
10 declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>) #0
11 declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>) #0
12 declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) #0
13 declare i32 @llvm.amdgcn.workitem.id.x() #0
; Kernel case with both operands variable: %mag and %sign are half kernel
; arguments and the copysign result is stored to %arg_out.
; The checks below expect one v_bfi_b32 per target. For tahiti both halves are
; first widened with v_cvt_f32_f16 and the result narrowed with
; v_cvt_f16_f32; for tonga, gfx900 and gfx1100 the select mask is 0x7fff.
15 define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, half %sign) {
16 ; SI-LABEL: s_copysign_f16:
18 ; SI-NEXT: s_load_dword s0, s[4:5], 0xb
19 ; SI-NEXT: s_brev_b32 s2, -2
20 ; SI-NEXT: s_mov_b32 s3, 0xf000
21 ; SI-NEXT: s_waitcnt lgkmcnt(0)
22 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
23 ; SI-NEXT: s_lshr_b32 s0, s0, 16
24 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
25 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
26 ; SI-NEXT: v_bfi_b32 v0, s2, v0, v1
27 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
28 ; SI-NEXT: s_mov_b32 s2, -1
29 ; SI-NEXT: s_waitcnt lgkmcnt(0)
30 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
33 ; VI-LABEL: s_copysign_f16:
35 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
36 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
37 ; VI-NEXT: s_movk_i32 s3, 0x7fff
38 ; VI-NEXT: s_waitcnt lgkmcnt(0)
39 ; VI-NEXT: s_lshr_b32 s4, s2, 16
40 ; VI-NEXT: v_mov_b32_e32 v0, s2
41 ; VI-NEXT: v_mov_b32_e32 v1, s4
42 ; VI-NEXT: v_bfi_b32 v2, s3, v0, v1
43 ; VI-NEXT: v_mov_b32_e32 v0, s0
44 ; VI-NEXT: v_mov_b32_e32 v1, s1
45 ; VI-NEXT: flat_store_short v[0:1], v2
48 ; GFX9-LABEL: s_copysign_f16:
50 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
51 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
52 ; GFX9-NEXT: s_movk_i32 s3, 0x7fff
53 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
54 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
55 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16
56 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
57 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
58 ; GFX9-NEXT: v_bfi_b32 v1, s3, v1, v2
59 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
62 ; GFX11-LABEL: s_copysign_f16:
64 ; GFX11-NEXT: s_clause 0x1
65 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
66 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
67 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
68 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
70 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
71 ; GFX11-NEXT: v_mov_b32_e32 v0, s3
72 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
73 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
74 ; GFX11-NEXT: s_endpgm
75 %out = call half @llvm.copysign.f16(half %mag, half %sign)
76 store half %out, ptr addrspace(1) %arg_out
; Kernel case with a constant sign operand of 0.0: %mag is a half kernel
; argument and the result is stored to %out.
; The checks below expect the sign bit to be cleared with a single
; s_and_b32 ..., 0x7fff on every target, with no v_bfi_b32.
80 define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %mag) {
81 ; SI-LABEL: s_test_copysign_f16_0:
83 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
84 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
85 ; SI-NEXT: s_mov_b32 s3, 0xf000
86 ; SI-NEXT: s_mov_b32 s2, -1
87 ; SI-NEXT: s_waitcnt lgkmcnt(0)
88 ; SI-NEXT: s_and_b32 s4, s6, 0x7fff
89 ; SI-NEXT: v_mov_b32_e32 v0, s4
90 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
93 ; VI-LABEL: s_test_copysign_f16_0:
95 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
96 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
97 ; VI-NEXT: s_waitcnt lgkmcnt(0)
98 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
99 ; VI-NEXT: v_mov_b32_e32 v0, s0
100 ; VI-NEXT: v_mov_b32_e32 v1, s1
101 ; VI-NEXT: v_mov_b32_e32 v2, s2
102 ; VI-NEXT: flat_store_short v[0:1], v2
105 ; GFX9-LABEL: s_test_copysign_f16_0:
107 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
108 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
109 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
110 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
111 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
112 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
113 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
114 ; GFX9-NEXT: s_endpgm
116 ; GFX11-LABEL: s_test_copysign_f16_0:
118 ; GFX11-NEXT: s_clause 0x1
119 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
120 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
121 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
122 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
123 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
124 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
125 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
126 ; GFX11-NEXT: s_endpgm
127 %result = call half @llvm.copysign.f16(half %mag, half 0.0)
128 store half %result, ptr addrspace(1) %out, align 4
; Kernel case with a constant sign operand of 1.0: %mag is a half kernel
; argument and the result is stored to %out.
; The checks below expect the sign bit to be cleared with a single
; s_and_b32 ..., 0x7fff on every target, with no v_bfi_b32.
132 define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %mag) {
133 ; SI-LABEL: s_test_copysign_f16_1:
135 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
136 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
137 ; SI-NEXT: s_mov_b32 s3, 0xf000
138 ; SI-NEXT: s_mov_b32 s2, -1
139 ; SI-NEXT: s_waitcnt lgkmcnt(0)
140 ; SI-NEXT: s_and_b32 s4, s6, 0x7fff
141 ; SI-NEXT: v_mov_b32_e32 v0, s4
142 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
145 ; VI-LABEL: s_test_copysign_f16_1:
147 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
148 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
149 ; VI-NEXT: s_waitcnt lgkmcnt(0)
150 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
151 ; VI-NEXT: v_mov_b32_e32 v0, s0
152 ; VI-NEXT: v_mov_b32_e32 v1, s1
153 ; VI-NEXT: v_mov_b32_e32 v2, s2
154 ; VI-NEXT: flat_store_short v[0:1], v2
157 ; GFX9-LABEL: s_test_copysign_f16_1:
159 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
160 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
161 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
162 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
163 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
164 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
165 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
166 ; GFX9-NEXT: s_endpgm
168 ; GFX11-LABEL: s_test_copysign_f16_1:
170 ; GFX11-NEXT: s_clause 0x1
171 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
172 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
173 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
174 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
175 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
176 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
177 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
178 ; GFX11-NEXT: s_endpgm
179 %result = call half @llvm.copysign.f16(half %mag, half 1.0)
180 store half %result, ptr addrspace(1) %out, align 4
; Kernel case with a constant sign operand of 10.0: %mag is a half kernel
; argument and the result is stored to %out.
; The checks below expect the sign bit to be cleared with a single
; s_and_b32 ..., 0x7fff on every target, with no v_bfi_b32.
184 define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half %mag) {
185 ; SI-LABEL: s_test_copysign_f16_10.0:
187 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
188 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
189 ; SI-NEXT: s_mov_b32 s3, 0xf000
190 ; SI-NEXT: s_mov_b32 s2, -1
191 ; SI-NEXT: s_waitcnt lgkmcnt(0)
192 ; SI-NEXT: s_and_b32 s4, s6, 0x7fff
193 ; SI-NEXT: v_mov_b32_e32 v0, s4
194 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
197 ; VI-LABEL: s_test_copysign_f16_10.0:
199 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
200 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
201 ; VI-NEXT: s_waitcnt lgkmcnt(0)
202 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
203 ; VI-NEXT: v_mov_b32_e32 v0, s0
204 ; VI-NEXT: v_mov_b32_e32 v1, s1
205 ; VI-NEXT: v_mov_b32_e32 v2, s2
206 ; VI-NEXT: flat_store_short v[0:1], v2
209 ; GFX9-LABEL: s_test_copysign_f16_10.0:
211 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
212 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
213 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
214 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
215 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff
216 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
217 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
218 ; GFX9-NEXT: s_endpgm
220 ; GFX11-LABEL: s_test_copysign_f16_10.0:
222 ; GFX11-NEXT: s_clause 0x1
223 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
224 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
225 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
226 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
227 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
228 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
229 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
230 ; GFX11-NEXT: s_endpgm
231 %result = call half @llvm.copysign.f16(half %mag, half 10.0)
232 store half %result, ptr addrspace(1) %out, align 4
; Kernel case with a constant sign operand of -1.0: %mag is a half kernel
; argument and the result is stored to %out.
; The checks below expect the sign bit (bit 15) to be set with one scalar
; instruction. For tahiti that is s_or_b32 ..., 0x8000; for tonga, gfx900 and
; gfx1100 it is s_bitset1_b32 ..., 15.
236 define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) {
237 ; SI-LABEL: s_test_copysign_f16_neg1:
239 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
240 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
241 ; SI-NEXT: s_mov_b32 s3, 0xf000
242 ; SI-NEXT: s_mov_b32 s2, -1
243 ; SI-NEXT: s_waitcnt lgkmcnt(0)
244 ; SI-NEXT: s_or_b32 s4, s6, 0x8000
245 ; SI-NEXT: v_mov_b32_e32 v0, s4
246 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
249 ; VI-LABEL: s_test_copysign_f16_neg1:
251 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
252 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
253 ; VI-NEXT: s_waitcnt lgkmcnt(0)
254 ; VI-NEXT: s_bitset1_b32 s2, 15
255 ; VI-NEXT: v_mov_b32_e32 v0, s0
256 ; VI-NEXT: v_mov_b32_e32 v1, s1
257 ; VI-NEXT: v_mov_b32_e32 v2, s2
258 ; VI-NEXT: flat_store_short v[0:1], v2
261 ; GFX9-LABEL: s_test_copysign_f16_neg1:
263 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
264 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
265 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
266 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
267 ; GFX9-NEXT: s_bitset1_b32 s2, 15
268 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
269 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
270 ; GFX9-NEXT: s_endpgm
272 ; GFX11-LABEL: s_test_copysign_f16_neg1:
274 ; GFX11-NEXT: s_clause 0x1
275 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
276 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
277 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
278 ; GFX11-NEXT: s_bitset1_b32 s2, 15
279 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
280 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
281 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
282 ; GFX11-NEXT: s_endpgm
283 %result = call half @llvm.copysign.f16(half %mag, half -1.0)
284 store half %result, ptr addrspace(1) %out, align 4
; Kernel case with a constant sign operand of -10.0: %mag is a half kernel
; argument and the result is stored to %out.
; The checks below expect the sign bit (bit 15) to be set with one scalar
; instruction. For tahiti that is s_or_b32 ..., 0x8000; for tonga, gfx900 and
; gfx1100 it is s_bitset1_b32 ..., 15.
288 define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half %mag) {
289 ; SI-LABEL: s_test_copysign_f16_neg10:
291 ; SI-NEXT: s_load_dword s6, s[4:5], 0xb
292 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
293 ; SI-NEXT: s_mov_b32 s3, 0xf000
294 ; SI-NEXT: s_mov_b32 s2, -1
295 ; SI-NEXT: s_waitcnt lgkmcnt(0)
296 ; SI-NEXT: s_or_b32 s4, s6, 0x8000
297 ; SI-NEXT: v_mov_b32_e32 v0, s4
298 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
301 ; VI-LABEL: s_test_copysign_f16_neg10:
303 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
304 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
305 ; VI-NEXT: s_waitcnt lgkmcnt(0)
306 ; VI-NEXT: s_bitset1_b32 s2, 15
307 ; VI-NEXT: v_mov_b32_e32 v0, s0
308 ; VI-NEXT: v_mov_b32_e32 v1, s1
309 ; VI-NEXT: v_mov_b32_e32 v2, s2
310 ; VI-NEXT: flat_store_short v[0:1], v2
313 ; GFX9-LABEL: s_test_copysign_f16_neg10:
315 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
316 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
317 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
318 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
319 ; GFX9-NEXT: s_bitset1_b32 s2, 15
320 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
321 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
322 ; GFX9-NEXT: s_endpgm
324 ; GFX11-LABEL: s_test_copysign_f16_neg10:
326 ; GFX11-NEXT: s_clause 0x1
327 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
328 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
329 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
330 ; GFX11-NEXT: s_bitset1_b32 s2, 15
331 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
332 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
333 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
334 ; GFX11-NEXT: s_endpgm
335 %result = call half @llvm.copysign.f16(half %mag, half -10.0)
336 store half %result, ptr addrspace(1) %out, align 4
; Kernel case with a constant magnitude of 0.0: %sign is a half kernel
; argument and the result is stored to %out.
; For tonga, gfx900 and gfx1100 the checks below expect only the sign bit to
; be kept, with s_and_b32 ..., 0x8000. For tahiti they expect %sign widened
; with v_cvt_f32_f16, a v_bfi_b32 taking the constant 0 as the magnitude
; operand, and the result narrowed with v_cvt_f16_f32.
340 define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half %sign) {
341 ; SI-LABEL: s_test_copysign_f16_0_mag:
343 ; SI-NEXT: s_load_dword s0, s[4:5], 0xb
344 ; SI-NEXT: s_brev_b32 s2, -2
345 ; SI-NEXT: s_mov_b32 s3, 0xf000
346 ; SI-NEXT: s_waitcnt lgkmcnt(0)
347 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
348 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
349 ; SI-NEXT: v_bfi_b32 v0, s2, 0, v0
350 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
351 ; SI-NEXT: s_mov_b32 s2, -1
352 ; SI-NEXT: s_waitcnt lgkmcnt(0)
353 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
356 ; VI-LABEL: s_test_copysign_f16_0_mag:
358 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
359 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
360 ; VI-NEXT: s_waitcnt lgkmcnt(0)
361 ; VI-NEXT: s_and_b32 s2, s2, 0x8000
362 ; VI-NEXT: v_mov_b32_e32 v0, s0
363 ; VI-NEXT: v_mov_b32_e32 v1, s1
364 ; VI-NEXT: v_mov_b32_e32 v2, s2
365 ; VI-NEXT: flat_store_short v[0:1], v2
368 ; GFX9-LABEL: s_test_copysign_f16_0_mag:
370 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
371 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
372 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
373 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
374 ; GFX9-NEXT: s_and_b32 s2, s2, 0x8000
375 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
376 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
377 ; GFX9-NEXT: s_endpgm
379 ; GFX11-LABEL: s_test_copysign_f16_0_mag:
381 ; GFX11-NEXT: s_clause 0x1
382 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
383 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
384 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
385 ; GFX11-NEXT: s_and_b32 s2, s2, 0x8000
386 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
387 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
388 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
389 ; GFX11-NEXT: s_endpgm
390 %result = call half @llvm.copysign.f16(half 0.0, half %sign)
391 store half %result, ptr addrspace(1) %out, align 4
; Kernel case with a constant magnitude of 1.0: %sign is a half kernel
; argument and the result is stored to %out.
; For tonga, gfx900 and gfx1100 the checks below expect the sign bit to be
; isolated with s_and_b32 ..., 0x8000 and then merged with the magnitude
; bits using s_or_b32 ..., 0x3c00. For tahiti they expect %sign widened with
; v_cvt_f32_f16, a v_bfi_b32 with the inline constant 1.0 as the magnitude
; operand, and the result narrowed with v_cvt_f16_f32.
396 define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half %sign) {
397 ; SI-LABEL: s_test_copysign_f16_1_mag:
399 ; SI-NEXT: s_load_dword s0, s[4:5], 0xb
400 ; SI-NEXT: s_brev_b32 s2, -2
401 ; SI-NEXT: s_mov_b32 s3, 0xf000
402 ; SI-NEXT: s_waitcnt lgkmcnt(0)
403 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
404 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
405 ; SI-NEXT: v_bfi_b32 v0, s2, 1.0, v0
406 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
407 ; SI-NEXT: s_mov_b32 s2, -1
408 ; SI-NEXT: s_waitcnt lgkmcnt(0)
409 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
412 ; VI-LABEL: s_test_copysign_f16_1_mag:
414 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
415 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
416 ; VI-NEXT: s_waitcnt lgkmcnt(0)
417 ; VI-NEXT: s_and_b32 s2, s2, 0x8000
418 ; VI-NEXT: s_or_b32 s2, s2, 0x3c00
419 ; VI-NEXT: v_mov_b32_e32 v0, s0
420 ; VI-NEXT: v_mov_b32_e32 v1, s1
421 ; VI-NEXT: v_mov_b32_e32 v2, s2
422 ; VI-NEXT: flat_store_short v[0:1], v2
425 ; GFX9-LABEL: s_test_copysign_f16_1_mag:
427 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
428 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
429 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
430 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
431 ; GFX9-NEXT: s_and_b32 s2, s2, 0x8000
432 ; GFX9-NEXT: s_or_b32 s2, s2, 0x3c00
433 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
434 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
435 ; GFX9-NEXT: s_endpgm
437 ; GFX11-LABEL: s_test_copysign_f16_1_mag:
439 ; GFX11-NEXT: s_clause 0x1
440 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
441 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
442 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
443 ; GFX11-NEXT: s_and_b32 s2, s2, 0x8000
444 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
445 ; GFX11-NEXT: s_or_b32 s2, s2, 0x3c00
446 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
447 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
448 ; GFX11-NEXT: s_endpgm
449 %result = call half @llvm.copysign.f16(half 1.0, half %sign)
450 store half %result, ptr addrspace(1) %out, align 4
; Kernel case with a constant magnitude of 10.0: %sign is a half kernel
; argument and the result is stored to %out.
; For tonga, gfx900 and gfx1100 the checks below expect the sign bit to be
; isolated with s_and_b32 ..., 0x8000 and then merged with the magnitude
; bits using s_or_b32 ..., 0x4900. For tahiti they expect the magnitude to be
; materialized as 0x41200000 in a VGPR, %sign widened with v_cvt_f32_f16, a
; v_bfi_b32 combining the two, and the result narrowed with v_cvt_f16_f32.
454 define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, half %sign) {
455 ; SI-LABEL: s_test_copysign_f16_10_mag:
457 ; SI-NEXT: s_load_dword s0, s[4:5], 0xb
458 ; SI-NEXT: s_brev_b32 s2, -2
459 ; SI-NEXT: v_mov_b32_e32 v1, 0x41200000
460 ; SI-NEXT: s_mov_b32 s3, 0xf000
461 ; SI-NEXT: s_waitcnt lgkmcnt(0)
462 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
463 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
464 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0
465 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
466 ; SI-NEXT: s_mov_b32 s2, -1
467 ; SI-NEXT: s_waitcnt lgkmcnt(0)
468 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
471 ; VI-LABEL: s_test_copysign_f16_10_mag:
473 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
474 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
475 ; VI-NEXT: s_waitcnt lgkmcnt(0)
476 ; VI-NEXT: s_and_b32 s2, s2, 0x8000
477 ; VI-NEXT: s_or_b32 s2, s2, 0x4900
478 ; VI-NEXT: v_mov_b32_e32 v0, s0
479 ; VI-NEXT: v_mov_b32_e32 v1, s1
480 ; VI-NEXT: v_mov_b32_e32 v2, s2
481 ; VI-NEXT: flat_store_short v[0:1], v2
484 ; GFX9-LABEL: s_test_copysign_f16_10_mag:
486 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
487 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
488 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
489 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
490 ; GFX9-NEXT: s_and_b32 s2, s2, 0x8000
491 ; GFX9-NEXT: s_or_b32 s2, s2, 0x4900
492 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
493 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
494 ; GFX9-NEXT: s_endpgm
496 ; GFX11-LABEL: s_test_copysign_f16_10_mag:
498 ; GFX11-NEXT: s_clause 0x1
499 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
500 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
501 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
502 ; GFX11-NEXT: s_and_b32 s2, s2, 0x8000
503 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
504 ; GFX11-NEXT: s_or_b32 s2, s2, 0x4900
505 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
506 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
507 ; GFX11-NEXT: s_endpgm
508 %result = call half @llvm.copysign.f16(half 10.0, half %sign)
509 store half %result, ptr addrspace(1) %out, align 4
; Kernel case with a constant magnitude of -1.0: %sign is a half kernel
; argument and the result is stored to %out.
; For tonga, gfx900 and gfx1100 the checks below expect the sign bit to be
; isolated with s_and_b32 ..., 0x8000 and then merged using
; s_or_b32 ..., 0x3c00, so the constant's own sign bit does not appear in the
; output. For tahiti they expect %sign widened with v_cvt_f32_f16, a
; v_bfi_b32 with the inline constant -1.0 as the magnitude operand, and the
; result narrowed with v_cvt_f16_f32.
513 define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, half %sign) {
514 ; SI-LABEL: s_test_copysign_f16_neg1_mag:
516 ; SI-NEXT: s_load_dword s0, s[4:5], 0xb
517 ; SI-NEXT: s_brev_b32 s2, -2
518 ; SI-NEXT: s_mov_b32 s3, 0xf000
519 ; SI-NEXT: s_waitcnt lgkmcnt(0)
520 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
521 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
522 ; SI-NEXT: v_bfi_b32 v0, s2, -1.0, v0
523 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
524 ; SI-NEXT: s_mov_b32 s2, -1
525 ; SI-NEXT: s_waitcnt lgkmcnt(0)
526 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
529 ; VI-LABEL: s_test_copysign_f16_neg1_mag:
531 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
532 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
533 ; VI-NEXT: s_waitcnt lgkmcnt(0)
534 ; VI-NEXT: s_and_b32 s2, s2, 0x8000
535 ; VI-NEXT: s_or_b32 s2, s2, 0x3c00
536 ; VI-NEXT: v_mov_b32_e32 v0, s0
537 ; VI-NEXT: v_mov_b32_e32 v1, s1
538 ; VI-NEXT: v_mov_b32_e32 v2, s2
539 ; VI-NEXT: flat_store_short v[0:1], v2
542 ; GFX9-LABEL: s_test_copysign_f16_neg1_mag:
544 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
545 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
546 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
547 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
548 ; GFX9-NEXT: s_and_b32 s2, s2, 0x8000
549 ; GFX9-NEXT: s_or_b32 s2, s2, 0x3c00
550 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
551 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
552 ; GFX9-NEXT: s_endpgm
554 ; GFX11-LABEL: s_test_copysign_f16_neg1_mag:
556 ; GFX11-NEXT: s_clause 0x1
557 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
558 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
559 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
560 ; GFX11-NEXT: s_and_b32 s2, s2, 0x8000
561 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
562 ; GFX11-NEXT: s_or_b32 s2, s2, 0x3c00
563 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
564 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
565 ; GFX11-NEXT: s_endpgm
566 %result = call half @llvm.copysign.f16(half -1.0, half %sign)
567 store half %result, ptr addrspace(1) %out, align 4
; Kernel case with a constant magnitude of -10.0: %sign is a half kernel
; argument and the result is stored to %out.
; For tonga, gfx900 and gfx1100 the checks below expect the sign bit to be
; isolated with s_and_b32 ..., 0x8000 and then merged using
; s_or_b32 ..., 0x4900, so the constant's own sign bit does not appear in the
; output. For tahiti they expect the magnitude to be materialized as
; 0xc1200000 in a VGPR, %sign widened with v_cvt_f32_f16, a v_bfi_b32
; combining the two, and the result narrowed with v_cvt_f16_f32.
571 define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, half %sign) {
572 ; SI-LABEL: s_test_copysign_f16_neg10_mag:
574 ; SI-NEXT: s_load_dword s0, s[4:5], 0xb
575 ; SI-NEXT: s_brev_b32 s2, -2
576 ; SI-NEXT: v_mov_b32_e32 v1, 0xc1200000
577 ; SI-NEXT: s_mov_b32 s3, 0xf000
578 ; SI-NEXT: s_waitcnt lgkmcnt(0)
579 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
580 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
581 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0
582 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
583 ; SI-NEXT: s_mov_b32 s2, -1
584 ; SI-NEXT: s_waitcnt lgkmcnt(0)
585 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
588 ; VI-LABEL: s_test_copysign_f16_neg10_mag:
590 ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
591 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
592 ; VI-NEXT: s_waitcnt lgkmcnt(0)
593 ; VI-NEXT: s_and_b32 s2, s2, 0x8000
594 ; VI-NEXT: s_or_b32 s2, s2, 0x4900
595 ; VI-NEXT: v_mov_b32_e32 v0, s0
596 ; VI-NEXT: v_mov_b32_e32 v1, s1
597 ; VI-NEXT: v_mov_b32_e32 v2, s2
598 ; VI-NEXT: flat_store_short v[0:1], v2
601 ; GFX9-LABEL: s_test_copysign_f16_neg10_mag:
603 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
604 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
605 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
606 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
607 ; GFX9-NEXT: s_and_b32 s2, s2, 0x8000
608 ; GFX9-NEXT: s_or_b32 s2, s2, 0x4900
609 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
610 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
611 ; GFX9-NEXT: s_endpgm
613 ; GFX11-LABEL: s_test_copysign_f16_neg10_mag:
615 ; GFX11-NEXT: s_clause 0x1
616 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
617 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
618 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
619 ; GFX11-NEXT: s_and_b32 s2, s2, 0x8000
620 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
621 ; GFX11-NEXT: s_or_b32 s2, s2, 0x4900
622 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
623 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
624 ; GFX11-NEXT: s_endpgm
625 %result = call half @llvm.copysign.f16(half -10.0, half %sign)
626 store half %result, ptr addrspace(1) %out, align 4
; Non-kernel function case with both operands variable: computes
; llvm.copysign.f16 of the half parameters %mag and %sign.
; For tonga, gfx900 and gfx1100 the checks below expect a single v_bfi_b32
; with a 0x7fff mask on v0/v1. For tahiti they expect v0 to be passed through
; v_cvt_f16_f32 and v_cvt_f32_f16 before a v_bfi_b32 whose mask comes from
; s_brev_b32 s4, -2.
630 define half @v_copysign_f16(half %mag, half %sign) {
631 ; SI-LABEL: v_copysign_f16:
633 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
634 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
635 ; SI-NEXT: s_brev_b32 s4, -2
636 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
637 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
638 ; SI-NEXT: s_setpc_b64 s[30:31]
640 ; VI-LABEL: v_copysign_f16:
642 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
643 ; VI-NEXT: s_movk_i32 s4, 0x7fff
644 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
645 ; VI-NEXT: s_setpc_b64 s[30:31]
647 ; GFX9-LABEL: v_copysign_f16:
649 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
650 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
651 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
652 ; GFX9-NEXT: s_setpc_b64 s[30:31]
654 ; GFX11-LABEL: v_copysign_f16:
656 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
657 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
658 ; GFX11-NEXT: s_setpc_b64 s[30:31]
659 %result = call half @llvm.copysign.f16(half %mag, half %sign)
; Non-kernel function case with a constant sign operand of 0.0: computes
; llvm.copysign.f16 of the half parameter %mag.
; For tonga, gfx900 and gfx1100 the checks below expect a single
; v_and_b32 0x7fff. For tahiti they expect v_cvt_f16_f32 followed by
; v_cvt_f32_f16 with the |v0| source modifier.
663 define half @v_test_copysign_f16_0(half %mag) {
664 ; SI-LABEL: v_test_copysign_f16_0:
666 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
667 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
668 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
669 ; SI-NEXT: s_setpc_b64 s[30:31]
671 ; VI-LABEL: v_test_copysign_f16_0:
673 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
674 ; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
675 ; VI-NEXT: s_setpc_b64 s[30:31]
677 ; GFX9-LABEL: v_test_copysign_f16_0:
679 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
680 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
681 ; GFX9-NEXT: s_setpc_b64 s[30:31]
683 ; GFX11-LABEL: v_test_copysign_f16_0:
685 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
686 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
687 ; GFX11-NEXT: s_setpc_b64 s[30:31]
688 %result = call half @llvm.copysign.f16(half %mag, half 0.0)
; Non-kernel function case with a constant sign operand of 1.0: computes
; llvm.copysign.f16 of the half parameter %mag.
; For tonga, gfx900 and gfx1100 the checks below expect a single
; v_and_b32 0x7fff. For tahiti they expect v_cvt_f16_f32 followed by
; v_cvt_f32_f16 with the |v0| source modifier.
692 define half @v_test_copysign_f16_1(half %mag) {
693 ; SI-LABEL: v_test_copysign_f16_1:
695 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
696 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
697 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
698 ; SI-NEXT: s_setpc_b64 s[30:31]
700 ; VI-LABEL: v_test_copysign_f16_1:
702 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703 ; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
704 ; VI-NEXT: s_setpc_b64 s[30:31]
706 ; GFX9-LABEL: v_test_copysign_f16_1:
708 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
709 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
710 ; GFX9-NEXT: s_setpc_b64 s[30:31]
712 ; GFX11-LABEL: v_test_copysign_f16_1:
714 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
715 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
716 ; GFX11-NEXT: s_setpc_b64 s[30:31]
717 %result = call half @llvm.copysign.f16(half %mag, half 1.0)
; Non-kernel function case with a constant sign operand of 10.0: computes
; llvm.copysign.f16 of the half parameter %mag.
; For tonga, gfx900 and gfx1100 the checks below expect a single
; v_and_b32 0x7fff. For tahiti they expect v_cvt_f16_f32 followed by
; v_cvt_f32_f16 with the |v0| source modifier.
721 define half @v_test_copysign_f16_10(half %mag) {
722 ; SI-LABEL: v_test_copysign_f16_10:
724 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
725 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
726 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
727 ; SI-NEXT: s_setpc_b64 s[30:31]
729 ; VI-LABEL: v_test_copysign_f16_10:
731 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
732 ; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
733 ; VI-NEXT: s_setpc_b64 s[30:31]
735 ; GFX9-LABEL: v_test_copysign_f16_10:
737 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
738 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
739 ; GFX9-NEXT: s_setpc_b64 s[30:31]
741 ; GFX11-LABEL: v_test_copysign_f16_10:
743 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
744 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
745 ; GFX11-NEXT: s_setpc_b64 s[30:31]
746 %result = call half @llvm.copysign.f16(half %mag, half 10.0)
; Non-kernel function case with a constant sign operand of -1.0: computes
; llvm.copysign.f16 of the half parameter %mag.
; For tonga, gfx900 and gfx1100 the checks below expect a single
; v_or_b32 0x8000. For tahiti they expect v_cvt_f16_f32 followed by
; v_cvt_f32_f16 with the -|v0| source modifier.
750 define half @v_test_copysign_f16_neg1(half %mag) {
751 ; SI-LABEL: v_test_copysign_f16_neg1:
753 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
754 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
755 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
756 ; SI-NEXT: s_setpc_b64 s[30:31]
758 ; VI-LABEL: v_test_copysign_f16_neg1:
760 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
761 ; VI-NEXT: v_or_b32_e32 v0, 0x8000, v0
762 ; VI-NEXT: s_setpc_b64 s[30:31]
764 ; GFX9-LABEL: v_test_copysign_f16_neg1:
766 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
767 ; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
768 ; GFX9-NEXT: s_setpc_b64 s[30:31]
770 ; GFX11-LABEL: v_test_copysign_f16_neg1:
772 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
773 ; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
774 ; GFX11-NEXT: s_setpc_b64 s[30:31]
775 %result = call half @llvm.copysign.f16(half %mag, half -1.0)
; Non-kernel function case with a constant sign operand of -10.0: computes
; llvm.copysign.f16 of the half parameter %mag.
; For tonga, gfx900 and gfx1100 the checks below expect a single
; v_or_b32 0x8000. For tahiti they expect v_cvt_f16_f32 followed by
; v_cvt_f32_f16 with the -|v0| source modifier.
779 define half @v_test_copysign_f16_neg10(half %mag) {
780 ; SI-LABEL: v_test_copysign_f16_neg10:
782 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
783 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
784 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
785 ; SI-NEXT: s_setpc_b64 s[30:31]
787 ; VI-LABEL: v_test_copysign_f16_neg10:
789 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
790 ; VI-NEXT: v_or_b32_e32 v0, 0x8000, v0
791 ; VI-NEXT: s_setpc_b64 s[30:31]
793 ; GFX9-LABEL: v_test_copysign_f16_neg10:
795 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
796 ; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
797 ; GFX9-NEXT: s_setpc_b64 s[30:31]
799 ; GFX11-LABEL: v_test_copysign_f16_neg10:
801 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
802 ; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
803 ; GFX11-NEXT: s_setpc_b64 s[30:31]
804 %result = call half @llvm.copysign.f16(half %mag, half -10.0)
; Mixed-width kernel case: a half magnitude is loaded from %arg_mag and a
; float sign from %arg_sign, both indexed by the workitem id x. The magnitude
; is extended to float with fpext, llvm.copysign.f32 is applied, and the
; float result is stored to %arg_out.
; The checks below expect the loaded half to be widened with v_cvt_f32_f16
; and combined with the loaded sign by one v_bfi_b32. For tahiti, tonga and
; gfx900 the mask is produced by s_brev_b32 ..., -2; for gfx1100 it is the
; literal 0x7fffffff.
808 define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
809 ; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
811 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
812 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
813 ; SI-NEXT: s_mov_b32 s11, 0xf000
814 ; SI-NEXT: s_mov_b32 s14, 0
815 ; SI-NEXT: s_mov_b32 s15, s11
816 ; SI-NEXT: s_waitcnt lgkmcnt(0)
817 ; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
818 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
819 ; SI-NEXT: v_mov_b32_e32 v2, 0
820 ; SI-NEXT: buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64
821 ; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
822 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
823 ; SI-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64
824 ; SI-NEXT: s_mov_b32 s8, s0
825 ; SI-NEXT: s_brev_b32 s0, -2
826 ; SI-NEXT: s_mov_b32 s10, -1
827 ; SI-NEXT: s_mov_b32 s9, s1
828 ; SI-NEXT: s_waitcnt vmcnt(1)
829 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3
830 ; SI-NEXT: s_waitcnt vmcnt(0)
831 ; SI-NEXT: v_bfi_b32 v0, s0, v1, v0
832 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
835 ; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
837 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
838 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
839 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
840 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
841 ; VI-NEXT: s_waitcnt lgkmcnt(0)
842 ; VI-NEXT: v_mov_b32_e32 v2, s3
843 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
844 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
845 ; VI-NEXT: flat_load_ushort v2, v[1:2]
846 ; VI-NEXT: v_mov_b32_e32 v1, s5
847 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
848 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
849 ; VI-NEXT: flat_load_dword v3, v[0:1]
850 ; VI-NEXT: v_mov_b32_e32 v0, s0
851 ; VI-NEXT: s_brev_b32 s0, -2
852 ; VI-NEXT: v_mov_b32_e32 v1, s1
853 ; VI-NEXT: s_waitcnt vmcnt(1)
854 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
855 ; VI-NEXT: s_waitcnt vmcnt(0)
856 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
857 ; VI-NEXT: flat_store_dword v[0:1], v2
860 ; GFX9-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
862 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
863 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
864 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
865 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
866 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
867 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
868 ; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
869 ; GFX9-NEXT: s_brev_b32 s2, -2
870 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
871 ; GFX9-NEXT: s_waitcnt vmcnt(1)
872 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
873 ; GFX9-NEXT: s_waitcnt vmcnt(0)
874 ; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0
875 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
876 ; GFX9-NEXT: s_endpgm
878 ; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
880 ; GFX11-NEXT: s_clause 0x1
881 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
882 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
883 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
884 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
885 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
886 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
887 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
888 ; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
889 ; GFX11-NEXT: global_load_b32 v0, v0, s[4:5]
890 ; GFX11-NEXT: s_waitcnt vmcnt(1)
891 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
892 ; GFX11-NEXT: s_waitcnt vmcnt(0)
893 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
894 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v1, v0
895 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
896 ; GFX11-NEXT: s_endpgm
897 %tid = call i32 @llvm.amdgcn.workitem.id.x()
898 %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
899 %mag = load half, ptr addrspace(1) %arg_mag_gep
900 %mag.ext = fpext half %mag to float
901 %arg_sign_gep = getelementptr float, ptr addrspace(1) %arg_sign, i32 %tid
902 %sign = load float, ptr addrspace(1) %arg_sign_gep
903 %out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
904 store float %out, ptr addrspace(1) %arg_out
; copysign producing f64 where the magnitude starts out as a half.
; Each workitem loads a half from %arg_mag[tid], extends it to double, loads a
; double sign from %arg_sign[tid], and stores copysign(mag, sign) to %arg_out.
; In every expected sequence below the half is converted f16 -> f32 -> f64 and
; one v_bfi_b32 with mask 0x7fffffff is applied to the high dword only (the
; mask is either the literal or is built with s_brev_b32 of -2).
908 define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
909 ; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
911 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
912 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
913 ; SI-NEXT: s_mov_b32 s11, 0xf000
914 ; SI-NEXT: s_mov_b32 s14, 0
915 ; SI-NEXT: s_mov_b32 s15, s11
916 ; SI-NEXT: s_waitcnt lgkmcnt(0)
917 ; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
918 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
919 ; SI-NEXT: v_mov_b32_e32 v2, 0
920 ; SI-NEXT: buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64
921 ; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
922 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
923 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[1:2], s[4:7], 0 addr64
924 ; SI-NEXT: s_mov_b32 s8, s0
925 ; SI-NEXT: s_brev_b32 s0, -2
926 ; SI-NEXT: s_mov_b32 s10, -1
927 ; SI-NEXT: s_mov_b32 s9, s1
928 ; SI-NEXT: s_waitcnt vmcnt(0)
929 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v3
930 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
931 ; SI-NEXT: v_bfi_b32 v3, s0, v3, v1
932 ; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
935 ; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
937 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
938 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
939 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
940 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
941 ; VI-NEXT: s_waitcnt lgkmcnt(0)
942 ; VI-NEXT: v_mov_b32_e32 v2, s3
943 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
944 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
945 ; VI-NEXT: flat_load_ushort v2, v[1:2]
946 ; VI-NEXT: v_mov_b32_e32 v1, s5
947 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
948 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
949 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
950 ; VI-NEXT: v_mov_b32_e32 v4, s0
951 ; VI-NEXT: s_brev_b32 s0, -2
952 ; VI-NEXT: v_mov_b32_e32 v5, s1
953 ; VI-NEXT: s_waitcnt vmcnt(0)
954 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v2
955 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
956 ; VI-NEXT: v_bfi_b32 v3, s0, v3, v1
957 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
960 ; GFX9-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
962 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
963 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
964 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
965 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
966 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
967 ; GFX9-NEXT: global_load_ushort v2, v1, s[2:3]
969 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
970 ; GFX9-NEXT: s_brev_b32 s2, -2
971 ; GFX9-NEXT: s_waitcnt vmcnt(0)
972 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v2
973 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
974 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
975 ; GFX9-NEXT: v_bfi_b32 v3, s2, v3, v1
976 ; GFX9-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
977 ; GFX9-NEXT: s_endpgm
979 ; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
981 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
982 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
983 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
984 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
985 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
986 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
987 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
988 ; GFX11-NEXT: global_load_u16 v2, v1, s[2:3]
989 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
990 ; GFX11-NEXT: s_waitcnt vmcnt(0)
991 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2
992 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
993 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
994 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
995 ; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v1
996 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
997 ; GFX11-NEXT: s_endpgm
998 %tid = call i32 @llvm.amdgcn.workitem.id.x()
999 %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
1000 %mag = load half, ptr addrspace(1) %arg_mag_gep
; Widen the magnitude so both copysign operands are f64.
1001 %mag.ext = fpext half %mag to double
1002 %arg_sign_gep = getelementptr double, ptr addrspace(1) %arg_sign, i32 %tid
1003 %sign = load double, ptr addrspace(1) %arg_sign_gep
1004 %out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
; The store targets %arg_out itself; it is not indexed by %tid.
1005 store double %out, ptr addrspace(1) %arg_out
; copysign producing f32 where the sign operand starts out as a half.
; Each workitem loads a float magnitude from %arg_mag[tid] and a half sign from
; %arg_sign[tid]; the sign is extended to float before the copysign call.
; The tahiti checks convert the half with v_cvt_f32_f16; the other three
; targets instead shift the raw half left by 16 (moving bit 15 to bit 31).
; All four then use v_bfi_b32 with mask 0x7fffffff.
1009 define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1010 ; SI-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
1012 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1013 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1014 ; SI-NEXT: s_mov_b32 s11, 0xf000
1015 ; SI-NEXT: s_mov_b32 s14, 0
1016 ; SI-NEXT: s_mov_b32 s15, s11
1017 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1018 ; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
1019 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1020 ; SI-NEXT: v_mov_b32_e32 v2, 0
1021 ; SI-NEXT: buffer_load_dword v3, v[1:2], s[12:15], 0 addr64
1022 ; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
1023 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
1024 ; SI-NEXT: buffer_load_ushort v0, v[1:2], s[4:7], 0 addr64
1025 ; SI-NEXT: s_mov_b32 s8, s0
1026 ; SI-NEXT: s_brev_b32 s0, -2
1027 ; SI-NEXT: s_mov_b32 s10, -1
1028 ; SI-NEXT: s_mov_b32 s9, s1
1029 ; SI-NEXT: s_waitcnt vmcnt(0)
1030 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1031 ; SI-NEXT: v_bfi_b32 v0, s0, v3, v0
1032 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1035 ; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
1037 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1038 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1039 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1040 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1041 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1042 ; VI-NEXT: v_mov_b32_e32 v3, s3
1043 ; VI-NEXT: v_mov_b32_e32 v1, s5
1044 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
1045 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1046 ; VI-NEXT: flat_load_ushort v4, v[0:1]
1047 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1048 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1049 ; VI-NEXT: flat_load_dword v2, v[0:1]
1050 ; VI-NEXT: v_mov_b32_e32 v0, s0
1051 ; VI-NEXT: s_brev_b32 s0, -2
1052 ; VI-NEXT: v_mov_b32_e32 v1, s1
1053 ; VI-NEXT: s_waitcnt vmcnt(1)
1054 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
1055 ; VI-NEXT: s_waitcnt vmcnt(0)
1056 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
1057 ; VI-NEXT: flat_store_dword v[0:1], v2
1060 ; GFX9-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
1062 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1063 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1064 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
1065 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1066 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1067 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1068 ; GFX9-NEXT: global_load_ushort v1, v1, s[6:7]
1069 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1070 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1071 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
1072 ; GFX9-NEXT: s_brev_b32 s2, -2
1073 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1074 ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
1075 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
1076 ; GFX9-NEXT: s_endpgm
1078 ; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
1080 ; GFX11-NEXT: s_clause 0x1
1081 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
1082 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1083 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1084 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1085 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
1086 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1087 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1088 ; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
1089 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1090 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1091 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1092 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1093 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1094 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
1095 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
1096 ; GFX11-NEXT: s_endpgm
1097 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1098 %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
1099 %mag = load float, ptr addrspace(1) %arg_mag_gep
1100 %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid
1101 %sign = load half, ptr addrspace(1) %arg_sign_gep
; Only the sign operand changes type here; the magnitude is already f32.
1102 %sign.ext = fpext half %sign to float
1103 %out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
; The store targets %arg_out itself; it is not indexed by %tid.
1104 store float %out, ptr addrspace(1) %arg_out
; copysign producing f64 where the sign operand starts out as a half.
; Each workitem loads a double magnitude from %arg_mag[tid] and a half sign
; from %arg_sign[tid]; the sign is extended to double before the copysign call.
; In the expected code only the high dword of the magnitude is rewritten by
; v_bfi_b32 (mask 0x7fffffff); the low dword is stored as loaded. The tahiti
; checks convert the half with v_cvt_f32_f16, the other three targets shift the
; raw half left by 16 instead.
1108 define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1109 ; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
1111 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1112 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1113 ; SI-NEXT: s_mov_b32 s11, 0xf000
1114 ; SI-NEXT: s_mov_b32 s14, 0
1115 ; SI-NEXT: s_mov_b32 s15, s11
1116 ; SI-NEXT: v_mov_b32_e32 v1, 0
1117 ; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
1118 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
1119 ; SI-NEXT: v_mov_b32_e32 v3, v1
1120 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1121 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64
1122 ; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
1123 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1124 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64
1125 ; SI-NEXT: s_mov_b32 s8, s0
1126 ; SI-NEXT: s_brev_b32 s0, -2
1127 ; SI-NEXT: s_mov_b32 s10, -1
1128 ; SI-NEXT: s_mov_b32 s9, s1
1129 ; SI-NEXT: s_waitcnt vmcnt(1)
1130 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
1131 ; SI-NEXT: s_waitcnt vmcnt(0)
1132 ; SI-NEXT: v_bfi_b32 v1, s0, v1, v2
1133 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1136 ; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
1138 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1139 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1140 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1141 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1142 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1143 ; VI-NEXT: v_mov_b32_e32 v3, s3
1144 ; VI-NEXT: v_mov_b32_e32 v1, s5
1145 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
1146 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1147 ; VI-NEXT: flat_load_ushort v4, v[0:1]
1148 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1149 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1150 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1151 ; VI-NEXT: v_mov_b32_e32 v2, s0
1152 ; VI-NEXT: s_brev_b32 s0, -2
1153 ; VI-NEXT: v_mov_b32_e32 v3, s1
1154 ; VI-NEXT: s_waitcnt vmcnt(1)
1155 ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
1156 ; VI-NEXT: s_waitcnt vmcnt(0)
1157 ; VI-NEXT: v_bfi_b32 v1, s0, v1, v4
1158 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1161 ; GFX9-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
1163 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1164 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1165 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
1166 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1167 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1168 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1169 ; GFX9-NEXT: global_load_ushort v2, v1, s[6:7]
1170 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1171 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1172 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1173 ; GFX9-NEXT: s_brev_b32 s2, -2
1174 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1175 ; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
1176 ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
1177 ; GFX9-NEXT: s_endpgm
1179 ; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
1181 ; GFX11-NEXT: s_clause 0x1
1182 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
1183 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1184 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1185 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1186 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
1187 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1188 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1189 ; GFX11-NEXT: global_load_u16 v2, v1, s[6:7]
1190 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
1191 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1192 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1193 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1194 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1195 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
1196 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
1197 ; GFX11-NEXT: s_endpgm
1198 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1199 %arg_mag_gep = getelementptr double, ptr addrspace(1) %arg_mag, i32 %tid
1200 %mag = load double, ptr addrspace(1) %arg_mag_gep
1201 %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid
1202 %sign = load half, ptr addrspace(1) %arg_sign_gep
; Only the sign operand changes type here; the magnitude is already f64.
1203 %sign.ext = fpext half %sign to double
1204 %out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
; The store targets %arg_out itself; it is not indexed by %tid.
1205 store double %out, ptr addrspace(1) %arg_out
; copysign producing f16 where the sign operand starts out as a float.
; Each workitem loads a half magnitude from %arg_mag[tid] and a float sign from
; %arg_sign[tid]; the sign is truncated to half before the copysign call.
; None of the expected sequences converts the sign value with v_cvt_f16_f32:
; - tahiti extends the magnitude to f32, applies v_bfi_b32 (mask 0x7fffffff)
;   against the raw float sign, then converts the result back to f16;
; - the other three targets shift the float right by 16 (moving bit 31 to
;   bit 15) and apply v_bfi_b32 with mask 0x7fff.
1209 define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1210 ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
1212 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1213 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1214 ; SI-NEXT: s_mov_b32 s11, 0xf000
1215 ; SI-NEXT: s_mov_b32 s14, 0
1216 ; SI-NEXT: s_mov_b32 s15, s11
1217 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1218 ; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
1219 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
1220 ; SI-NEXT: v_mov_b32_e32 v2, 0
1221 ; SI-NEXT: buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64
1222 ; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
1223 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1224 ; SI-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64
1225 ; SI-NEXT: s_brev_b32 s2, -2
1226 ; SI-NEXT: s_mov_b32 s10, -1
1227 ; SI-NEXT: s_mov_b32 s8, s0
1228 ; SI-NEXT: s_mov_b32 s9, s1
1229 ; SI-NEXT: s_waitcnt vmcnt(1)
1230 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3
1231 ; SI-NEXT: s_waitcnt vmcnt(0)
1232 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0
1233 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1234 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
1237 ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
1239 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1240 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1241 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
1242 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1243 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1244 ; VI-NEXT: v_mov_b32_e32 v3, s3
1245 ; VI-NEXT: v_mov_b32_e32 v1, s5
1246 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
1247 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1248 ; VI-NEXT: flat_load_dword v4, v[0:1]
1249 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
1250 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1251 ; VI-NEXT: flat_load_ushort v2, v[0:1]
1252 ; VI-NEXT: v_mov_b32_e32 v0, s0
1253 ; VI-NEXT: s_movk_i32 s0, 0x7fff
1254 ; VI-NEXT: v_mov_b32_e32 v1, s1
1255 ; VI-NEXT: s_waitcnt vmcnt(1)
1256 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
1257 ; VI-NEXT: s_waitcnt vmcnt(0)
1258 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
1259 ; VI-NEXT: flat_store_short v[0:1], v2
1262 ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
1264 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1265 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1266 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1267 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1268 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1269 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1270 ; GFX9-NEXT: global_load_dword v1, v1, s[6:7]
1271 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1272 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1273 ; GFX9-NEXT: global_load_ushort v0, v0, s[2:3]
1274 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff
1275 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1276 ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
1277 ; GFX9-NEXT: global_store_short v2, v0, s[0:1]
1278 ; GFX9-NEXT: s_endpgm
1280 ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
1282 ; GFX11-NEXT: s_clause 0x1
1283 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
1284 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1285 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1286 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1287 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
1288 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1289 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1290 ; GFX11-NEXT: global_load_b32 v1, v1, s[6:7]
1291 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
1292 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1293 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1294 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1295 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1296 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
1297 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
1298 ; GFX11-NEXT: s_endpgm
1299 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1300 %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
1301 %mag = load half, ptr addrspace(1) %arg_mag_gep
1302 %arg_sign_gep = getelementptr float, ptr addrspace(1) %arg_sign, i32 %tid
1303 %sign = load float, ptr addrspace(1) %arg_sign_gep
; Narrow the sign operand so both copysign operands are f16.
1304 %sign.trunc = fptrunc float %sign to half
1305 %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
; The store targets %arg_out itself; it is not indexed by %tid.
1306 store half %out, ptr addrspace(1) %arg_out
; copysign producing f16 where the sign operand starts out as a double.
; The sign is loaded from %arg_sign[tid] and truncated to half before the
; copysign call; the magnitude is a half loaded from %arg_mag (see the review
; note next to that load). The result is stored to %arg_out.
; In the expected code only the high dword of the loaded double feeds
; v_bfi_b32: tahiti uses it directly with mask 0x7fffffff around an
; f16 -> f32 -> f16 round trip of the magnitude, while the other three targets
; shift it right by 16 and use mask 0x7fff.
1310 define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1311 ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
1313 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1314 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
1315 ; SI-NEXT: s_mov_b32 s7, 0xf000
1316 ; SI-NEXT: s_mov_b32 s6, -1
1317 ; SI-NEXT: s_mov_b32 s14, s6
1318 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1319 ; SI-NEXT: s_mov_b32 s12, s2
1320 ; SI-NEXT: s_mov_b32 s13, s3
1321 ; SI-NEXT: s_mov_b32 s15, s7
1322 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
1323 ; SI-NEXT: s_mov_b32 s10, 0
1324 ; SI-NEXT: s_mov_b32 s11, s7
1325 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1326 ; SI-NEXT: v_mov_b32_e32 v1, 0
1327 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
1328 ; SI-NEXT: s_brev_b32 s2, -2
1329 ; SI-NEXT: s_mov_b32 s4, s0
1330 ; SI-NEXT: s_mov_b32 s5, s1
1331 ; SI-NEXT: s_waitcnt vmcnt(0)
1332 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2
1333 ; SI-NEXT: v_bfi_b32 v0, s2, v0, v1
1334 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1335 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
1338 ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
1340 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1341 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1342 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
1343 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1344 ; VI-NEXT: v_mov_b32_e32 v0, s2
1345 ; VI-NEXT: v_mov_b32_e32 v2, s5
1346 ; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
1347 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1348 ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
1349 ; VI-NEXT: s_waitcnt vmcnt(0)
1350 ; VI-NEXT: v_mov_b32_e32 v1, s3
1351 ; VI-NEXT: flat_load_ushort v3, v[0:1]
1352 ; VI-NEXT: v_mov_b32_e32 v0, s0
1353 ; VI-NEXT: s_movk_i32 s0, 0x7fff
1354 ; VI-NEXT: v_mov_b32_e32 v1, s1
1355 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1356 ; VI-NEXT: s_waitcnt vmcnt(0)
1357 ; VI-NEXT: v_bfi_b32 v2, s0, v3, v2
1358 ; VI-NEXT: flat_store_short v[0:1], v2
1361 ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
1363 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1364 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1365 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1366 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1367 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
1368 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1369 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1370 ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
1371 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff
1372 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1373 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1374 ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1
1375 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
1376 ; GFX9-NEXT: s_endpgm
1378 ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
1380 ; GFX11-NEXT: s_clause 0x1
1381 ; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
1382 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1383 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1384 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1385 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1386 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1387 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1388 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
1389 ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3]
1390 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1391 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1392 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1393 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1394 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
1395 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
1396 ; GFX11-NEXT: s_endpgm
1397 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1398 %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
; NOTE(review): the load below reads %arg_mag, not %arg_mag_gep, so the GEP
; above has no users and the magnitude address does not depend on %tid. The
; expected assembly above reflects that (the half load has no per-workitem
; address offset). This looks like an oversight relative to the sibling
; kernels in this file - confirm whether it is intended. Changing the operand
; requires regenerating the assertions with utils/update_llc_test_checks.py.
1399 %mag = load half, ptr addrspace(1) %arg_mag
1400 %arg_sign_gep = getelementptr double, ptr addrspace(1) %arg_sign, i32 %tid
1401 %sign = load double, ptr addrspace(1) %arg_sign_gep
; Narrow the sign operand so both copysign operands are f16.
1402 %sign.trunc = fptrunc double %sign to half
1403 %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
; The store targets %arg_out itself; it is not indexed by %tid.
1404 store half %out, ptr addrspace(1) %arg_out
; copysign producing f16 where the magnitude starts out as a float.
; Each workitem loads a float magnitude from %arg_mag[tid], truncates it to
; half, loads a half sign from %arg_sign[tid], and stores copysign(mag, sign)
; to %arg_out.
; Unlike the sign-truncation kernels, the magnitude conversion is kept in every
; expected sequence (v_cvt_f16_f32 on the loaded float). tahiti then
; re-extends both operands to f32, applies v_bfi_b32 with mask 0x7fffffff and
; converts back; the other three targets apply v_bfi_b32 with mask 0x7fff
; directly on the half values.
1408 define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1409 ; SI-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
1411 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1412 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
1413 ; SI-NEXT: s_mov_b32 s11, 0xf000
1414 ; SI-NEXT: s_mov_b32 s14, 0
1415 ; SI-NEXT: s_mov_b32 s15, s11
1416 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1417 ; SI-NEXT: s_mov_b64 s[12:13], s[2:3]
1418 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1419 ; SI-NEXT: v_mov_b32_e32 v2, 0
1420 ; SI-NEXT: buffer_load_dword v3, v[1:2], s[12:15], 0 addr64
1421 ; SI-NEXT: s_mov_b64 s[6:7], s[14:15]
1422 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
1423 ; SI-NEXT: buffer_load_ushort v0, v[1:2], s[4:7], 0 addr64
1424 ; SI-NEXT: s_brev_b32 s2, -2
1425 ; SI-NEXT: s_mov_b32 s10, -1
1426 ; SI-NEXT: s_mov_b32 s8, s0
1427 ; SI-NEXT: s_mov_b32 s9, s1
1428 ; SI-NEXT: s_waitcnt vmcnt(1)
1429 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
1430 ; SI-NEXT: s_waitcnt vmcnt(0)
1431 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1432 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1433 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0
1434 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1435 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
1438 ; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
1440 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1441 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
1442 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1443 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1444 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1445 ; VI-NEXT: v_mov_b32_e32 v2, s3
1446 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
1447 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1448 ; VI-NEXT: flat_load_dword v2, v[1:2]
1449 ; VI-NEXT: v_mov_b32_e32 v1, s5
1450 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
1451 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1452 ; VI-NEXT: flat_load_ushort v3, v[0:1]
1453 ; VI-NEXT: v_mov_b32_e32 v0, s0
1454 ; VI-NEXT: s_movk_i32 s0, 0x7fff
1455 ; VI-NEXT: v_mov_b32_e32 v1, s1
1456 ; VI-NEXT: s_waitcnt vmcnt(1)
1457 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
1458 ; VI-NEXT: s_waitcnt vmcnt(0)
1459 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
1460 ; VI-NEXT: flat_store_short v[0:1], v2
1463 ; GFX9-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
1465 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1466 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
1467 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1468 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1469 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1470 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1471 ; GFX9-NEXT: global_load_dword v1, v1, s[2:3]
1472 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff
1473 ; GFX9-NEXT: global_load_ushort v0, v0, s[6:7]
1474 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1475 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
1476 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1477 ; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0
1478 ; GFX9-NEXT: global_store_short v2, v0, s[0:1]
1479 ; GFX9-NEXT: s_endpgm
1481 ; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
1483 ; GFX11-NEXT: s_clause 0x1
1484 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1485 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
1486 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
1487 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1488 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
1489 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1490 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1491 ; GFX11-NEXT: global_load_b32 v1, v1, s[2:3]
1492 ; GFX11-NEXT: global_load_u16 v0, v0, s[4:5]
1493 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1494 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
1495 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1496 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1497 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
1498 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
1499 ; GFX11-NEXT: s_endpgm
1500 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1501 %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
1502 %mag = load float, ptr addrspace(1) %arg_mag_gep
; Narrow the magnitude so both copysign operands are f16.
1503 %mag.trunc = fptrunc float %mag to half
1504 %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid
1505 %sign = load half, ptr addrspace(1) %arg_sign_gep
1506 %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
; The store targets %arg_out itself; it is not indexed by %tid.
1507 store half %out, ptr addrspace(1) %arg_out
1511 define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) %arg_out, double %mag, half %sign) {
1512 ; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1514 ; SI-NEXT: s_load_dword s6, s[4:5], 0xd
1515 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1516 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1517 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s6
1518 ; SI-NEXT: s_lshr_b32 s4, s3, 8
1519 ; SI-NEXT: s_and_b32 s5, s3, 0x1ff
1520 ; SI-NEXT: s_and_b32 s6, s4, 0xffe
1521 ; SI-NEXT: s_or_b32 s2, s5, s2
1522 ; SI-NEXT: s_cmp_lg_u32 s2, 0
1523 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
1524 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
1525 ; SI-NEXT: v_readfirstlane_b32 s2, v1
1526 ; SI-NEXT: s_bfe_u32 s5, s3, 0xb0014
1527 ; SI-NEXT: s_or_b32 s2, s6, s2
1528 ; SI-NEXT: s_sub_i32 s6, 0x3f1, s5
1529 ; SI-NEXT: v_med3_i32 v1, s6, 0, 13
1530 ; SI-NEXT: s_or_b32 s4, s2, 0x1000
1531 ; SI-NEXT: v_readfirstlane_b32 s6, v1
1532 ; SI-NEXT: s_lshr_b32 s6, s4, s6
1533 ; SI-NEXT: v_lshl_b32_e32 v1, s6, v1
1534 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
1535 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
1536 ; SI-NEXT: s_add_i32 s8, s5, 0xfffffc10
1537 ; SI-NEXT: v_readfirstlane_b32 s4, v1
1538 ; SI-NEXT: s_lshl_b32 s5, s8, 12
1539 ; SI-NEXT: s_or_b32 s4, s6, s4
1540 ; SI-NEXT: s_or_b32 s5, s2, s5
1541 ; SI-NEXT: s_cmp_lt_i32 s8, 1
1542 ; SI-NEXT: s_cselect_b32 s9, s4, s5
1543 ; SI-NEXT: s_and_b32 s6, s9, 7
1544 ; SI-NEXT: s_cmp_gt_i32 s6, 5
1545 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
1546 ; SI-NEXT: s_cmp_eq_u32 s6, 3
1547 ; SI-NEXT: s_cselect_b64 s[6:7], -1, 0
1548 ; SI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
1549 ; SI-NEXT: s_lshr_b32 s6, s9, 2
1550 ; SI-NEXT: s_or_b32 s4, s4, s5
1551 ; SI-NEXT: s_cmp_lg_u32 s4, 0
1552 ; SI-NEXT: s_addc_u32 s4, s6, 0
1553 ; SI-NEXT: s_cmp_lt_i32 s8, 31
1554 ; SI-NEXT: s_cselect_b32 s6, s4, 0x7c00
1555 ; SI-NEXT: s_cmp_lg_u32 s2, 0
1556 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
1557 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
1558 ; SI-NEXT: v_lshlrev_b32_e32 v1, 9, v1
1559 ; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f
1560 ; SI-NEXT: v_or_b32_e32 v1, 0x7c00, v1
1561 ; SI-NEXT: v_mov_b32_e32 v2, s6
1562 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1563 ; SI-NEXT: s_lshr_b32 s2, s3, 16
1564 ; SI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
1565 ; SI-NEXT: s_and_b32 s2, s2, 0x8000
1566 ; SI-NEXT: v_or_b32_e32 v1, s2, v1
1567 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1568 ; SI-NEXT: s_brev_b32 s2, -2
1569 ; SI-NEXT: s_mov_b32 s3, 0xf000
1570 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0
1571 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1572 ; SI-NEXT: s_mov_b32 s2, -1
1573 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1576 ; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1578 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1579 ; VI-NEXT: s_load_dword s4, s[4:5], 0x34
1580 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1581 ; VI-NEXT: v_mov_b32_e32 v0, s0
1582 ; VI-NEXT: v_mov_b32_e32 v1, s1
1583 ; VI-NEXT: s_lshr_b32 s0, s3, 8
1584 ; VI-NEXT: s_and_b32 s1, s3, 0x1ff
1585 ; VI-NEXT: s_and_b32 s5, s0, 0xffe
1586 ; VI-NEXT: s_or_b32 s0, s1, s2
1587 ; VI-NEXT: s_cmp_lg_u32 s0, 0
1588 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
1589 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1590 ; VI-NEXT: s_bfe_u32 s1, s3, 0xb0014
1591 ; VI-NEXT: v_readfirstlane_b32 s0, v2
1592 ; VI-NEXT: s_sub_i32 s2, 0x3f1, s1
1593 ; VI-NEXT: s_or_b32 s5, s5, s0
1594 ; VI-NEXT: v_med3_i32 v2, s2, 0, 13
1595 ; VI-NEXT: s_or_b32 s0, s5, 0x1000
1596 ; VI-NEXT: v_readfirstlane_b32 s2, v2
1597 ; VI-NEXT: s_lshr_b32 s2, s0, s2
1598 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2
1599 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, s0, v2
1600 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1601 ; VI-NEXT: s_add_i32 s6, s1, 0xfffffc10
1602 ; VI-NEXT: v_readfirstlane_b32 s0, v2
1603 ; VI-NEXT: s_lshl_b32 s1, s6, 12
1604 ; VI-NEXT: s_or_b32 s0, s2, s0
1605 ; VI-NEXT: s_or_b32 s1, s5, s1
1606 ; VI-NEXT: s_cmp_lt_i32 s6, 1
1607 ; VI-NEXT: s_cselect_b32 s7, s0, s1
1608 ; VI-NEXT: s_and_b32 s2, s7, 7
1609 ; VI-NEXT: s_cmp_gt_i32 s2, 5
1610 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
1611 ; VI-NEXT: s_cmp_eq_u32 s2, 3
1612 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
1613 ; VI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
1614 ; VI-NEXT: s_lshr_b32 s2, s7, 2
1615 ; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
1616 ; VI-NEXT: s_addc_u32 s0, s2, 0
1617 ; VI-NEXT: s_cmp_lt_i32 s6, 31
1618 ; VI-NEXT: s_cselect_b32 s2, s0, 0x7c00
1619 ; VI-NEXT: s_cmp_lg_u32 s5, 0
1620 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
1621 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1622 ; VI-NEXT: v_lshlrev_b32_e32 v2, 9, v2
1623 ; VI-NEXT: s_cmpk_eq_i32 s6, 0x40f
1624 ; VI-NEXT: v_or_b32_e32 v2, 0x7c00, v2
1625 ; VI-NEXT: v_mov_b32_e32 v3, s2
1626 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1627 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
1628 ; VI-NEXT: s_movk_i32 s0, 0x7fff
1629 ; VI-NEXT: v_mov_b32_e32 v3, s4
1630 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
1631 ; VI-NEXT: flat_store_short v[0:1], v2
1634 ; GFX9-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1636 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1637 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34
1638 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1639 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1640 ; GFX9-NEXT: s_lshr_b32 s4, s3, 8
1641 ; GFX9-NEXT: s_and_b32 s5, s3, 0x1ff
1642 ; GFX9-NEXT: s_and_b32 s7, s4, 0xffe
1643 ; GFX9-NEXT: s_or_b32 s2, s5, s2
1644 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0
1645 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
1646 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
1647 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0xb0014
1648 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1
1649 ; GFX9-NEXT: s_sub_i32 s4, 0x3f1, s3
1650 ; GFX9-NEXT: s_or_b32 s7, s7, s2
1651 ; GFX9-NEXT: v_med3_i32 v1, s4, 0, 13
1652 ; GFX9-NEXT: s_or_b32 s2, s7, 0x1000
1653 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1
1654 ; GFX9-NEXT: s_lshr_b32 s4, s2, s4
1655 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s4
1656 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s2, v1
1657 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
1658 ; GFX9-NEXT: s_add_i32 s8, s3, 0xfffffc10
1659 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1
1660 ; GFX9-NEXT: s_lshl_b32 s3, s8, 12
1661 ; GFX9-NEXT: s_or_b32 s2, s4, s2
1662 ; GFX9-NEXT: s_or_b32 s3, s7, s3
1663 ; GFX9-NEXT: s_cmp_lt_i32 s8, 1
1664 ; GFX9-NEXT: s_cselect_b32 s9, s2, s3
1665 ; GFX9-NEXT: s_and_b32 s4, s9, 7
1666 ; GFX9-NEXT: s_cmp_gt_i32 s4, 5
1667 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
1668 ; GFX9-NEXT: s_cmp_eq_u32 s4, 3
1669 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
1670 ; GFX9-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
1671 ; GFX9-NEXT: s_lshr_b32 s4, s9, 2
1672 ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
1673 ; GFX9-NEXT: s_addc_u32 s2, s4, 0
1674 ; GFX9-NEXT: s_cmp_lt_i32 s8, 31
1675 ; GFX9-NEXT: s_cselect_b32 s4, s2, 0x7c00
1676 ; GFX9-NEXT: s_cmp_lg_u32 s7, 0
1677 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
1678 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
1679 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 9, v1
1680 ; GFX9-NEXT: s_cmpk_eq_i32 s8, 0x40f
1681 ; GFX9-NEXT: v_or_b32_e32 v1, 0x7c00, v1
1682 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
1683 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
1684 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
1685 ; GFX9-NEXT: s_movk_i32 s2, 0x7fff
1686 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
1687 ; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
1688 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
1689 ; GFX9-NEXT: s_endpgm
1691 ; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1693 ; GFX11-NEXT: s_clause 0x1
1694 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1695 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
1696 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1697 ; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff
1698 ; GFX11-NEXT: s_lshr_b32 s6, s3, 8
1699 ; GFX11-NEXT: s_or_b32 s2, s5, s2
1700 ; GFX11-NEXT: s_and_b32 s5, s6, 0xffe
1701 ; GFX11-NEXT: s_cmp_lg_u32 s2, 0
1702 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
1703 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1704 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
1705 ; GFX11-NEXT: s_bfe_u32 s2, s3, 0xb0014
1706 ; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s2
1707 ; GFX11-NEXT: s_addk_i32 s2, 0xfc10
1708 ; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
1709 ; GFX11-NEXT: v_readfirstlane_b32 s3, v0
1710 ; GFX11-NEXT: s_lshl_b32 s7, s2, 12
1711 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1712 ; GFX11-NEXT: v_readfirstlane_b32 s6, v1
1713 ; GFX11-NEXT: s_or_b32 s3, s5, s3
1714 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1715 ; GFX11-NEXT: s_or_b32 s5, s3, 0x1000
1716 ; GFX11-NEXT: s_or_b32 s7, s3, s7
1717 ; GFX11-NEXT: s_lshr_b32 s6, s5, s6
1718 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1719 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6
1720 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1721 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
1722 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
1723 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1724 ; GFX11-NEXT: v_readfirstlane_b32 s5, v0
1725 ; GFX11-NEXT: s_or_b32 s5, s6, s5
1726 ; GFX11-NEXT: s_cmp_lt_i32 s2, 1
1727 ; GFX11-NEXT: s_cselect_b32 s5, s5, s7
1728 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1729 ; GFX11-NEXT: s_and_b32 s6, s5, 7
1730 ; GFX11-NEXT: s_cmp_gt_i32 s6, 5
1731 ; GFX11-NEXT: s_cselect_b32 s7, -1, 0
1732 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3
1733 ; GFX11-NEXT: s_cselect_b32 s6, -1, 0
1734 ; GFX11-NEXT: s_lshr_b32 s5, s5, 2
1735 ; GFX11-NEXT: s_or_b32 s6, s6, s7
1736 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1737 ; GFX11-NEXT: s_cmp_lg_u32 s6, 0
1738 ; GFX11-NEXT: s_addc_u32 s5, s5, 0
1739 ; GFX11-NEXT: s_cmp_lt_i32 s2, 31
1740 ; GFX11-NEXT: s_cselect_b32 s5, s5, 0x7c00
1741 ; GFX11-NEXT: s_cmp_lg_u32 s3, 0
1742 ; GFX11-NEXT: s_cselect_b32 s3, -1, 0
1743 ; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
1744 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3
1745 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
1746 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1747 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0
1748 ; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0
1749 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1750 ; GFX11-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo
1751 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4
1752 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
1753 ; GFX11-NEXT: s_endpgm
1754 %mag.trunc = fptrunc double %mag to half
1755 %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
1756 store half %result, ptr addrspace(1) %arg_out
; s_copysign_v2f16 - copysign on <2 x half> operands that arrive as kernel
; arguments (every expected output below reads them with a scalar load); the
; result is stored to %arg_out.
;
; What the generated checks show per target:
;   * tahiti (SI prefix) - each half is widened with v_cvt_f32_f16, combined
;     by v_bfi_b32 using the mask produced by `s_brev_b32 s2, -2`, narrowed
;     again with v_cvt_f16_f32, and the two results are packed with a
;     shift-left-by-16 and an OR before a single dword store.
;   * tonga, gfx900, gfx1100 - v_bfi_b32 with a 0x7fff mask is applied to the
;     unshifted argument registers for element 0 and to the registers shifted
;     right by 16 (s_lshr_b32) for element 1; the two results are then
;     repacked into one dword and stored.
;
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1760 define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half> %arg_mag, <2 x half> %arg_sign) {
1761 ; SI-LABEL: s_copysign_v2f16:
1763 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1764 ; SI-NEXT: s_mov_b32 s7, 0xf000
1765 ; SI-NEXT: s_mov_b32 s6, -1
1766 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1767 ; SI-NEXT: s_lshr_b32 s4, s2, 16
1768 ; SI-NEXT: s_lshr_b32 s5, s3, 16
1769 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
1770 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
1771 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
1772 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s3
1773 ; SI-NEXT: s_brev_b32 s2, -2
1774 ; SI-NEXT: v_bfi_b32 v0, s2, v0, v1
1775 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1776 ; SI-NEXT: v_bfi_b32 v1, s2, v2, v3
1777 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1778 ; SI-NEXT: s_mov_b32 s4, s0
1779 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1780 ; SI-NEXT: s_mov_b32 s5, s1
1781 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1782 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1785 ; VI-LABEL: s_copysign_v2f16:
1787 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1788 ; VI-NEXT: s_movk_i32 s4, 0x7fff
1789 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1790 ; VI-NEXT: v_mov_b32_e32 v0, s2
1791 ; VI-NEXT: v_mov_b32_e32 v1, s3
1792 ; VI-NEXT: s_lshr_b32 s3, s3, 16
1793 ; VI-NEXT: s_lshr_b32 s2, s2, 16
1794 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
1795 ; VI-NEXT: v_mov_b32_e32 v1, s2
1796 ; VI-NEXT: v_mov_b32_e32 v2, s3
1797 ; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
1798 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1799 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1800 ; VI-NEXT: v_mov_b32_e32 v0, s0
1801 ; VI-NEXT: v_mov_b32_e32 v1, s1
1802 ; VI-NEXT: flat_store_dword v[0:1], v2
1805 ; GFX9-LABEL: s_copysign_v2f16:
1807 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1808 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
1809 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1810 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1811 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1812 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
1813 ; GFX9-NEXT: s_lshr_b32 s3, s3, 16
1814 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
1815 ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
1816 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1817 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1818 ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3
1819 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
1820 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
1821 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1822 ; GFX9-NEXT: s_endpgm
1824 ; GFX11-LABEL: s_copysign_v2f16:
1826 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1827 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1828 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1829 ; GFX11-NEXT: v_mov_b32_e32 v0, s3
1830 ; GFX11-NEXT: s_lshr_b32 s3, s3, 16
1831 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1832 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
1833 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
1834 ; GFX11-NEXT: s_lshr_b32 s2, s2, 16
1835 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
1836 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
1837 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1838 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
1839 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1840 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
1841 ; GFX11-NEXT: s_endpgm
1842 %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign)
1843 store <2 x half> %out, ptr addrspace(1) %arg_out
; s_copysign_v3f16 - copysign on <3 x half> kernel arguments, covering the
; odd-sized vector case; the result is stored to %arg_out.
;
; What the generated checks show:
;   * All four targets read the two vector arguments with one 128-bit scalar
;     load and the output pointer with a separate 64-bit scalar load.
;   * tahiti (SI prefix) widens every element to f32 (v_cvt_f32_f16), applies
;     v_bfi_b32 with the mask from `s_brev_b32 s0, -2`, and narrows back.
;   * tonga, gfx900 and gfx1100 apply v_bfi_b32 with a 0x7fff mask to the
;     16-bit values without any conversion.
;   * On every target elements 0 and 1 are packed into one dword store at
;     offset 0, while element 2 is written by a separate 16-bit store at
;     byte offset 4 (tonga forms that address with s_add_u32/s_addc_u32).
;
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1847 define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) {
1848 ; SI-LABEL: s_copysign_v3f16:
1850 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
1851 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
1852 ; SI-NEXT: s_mov_b32 s7, 0xf000
1853 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1854 ; SI-NEXT: s_lshr_b32 s6, s0, 16
1855 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
1856 ; SI-NEXT: s_lshr_b32 s0, s2, 16
1857 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6
1858 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s0
1859 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
1860 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s3
1861 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s2
1862 ; SI-NEXT: s_brev_b32 s0, -2
1863 ; SI-NEXT: v_bfi_b32 v2, s0, v2, v3
1864 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
1865 ; SI-NEXT: v_bfi_b32 v1, s0, v1, v5
1866 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v4
1867 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1868 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1869 ; SI-NEXT: s_mov_b32 s6, -1
1870 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1871 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
1872 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
1873 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
1876 ; VI-LABEL: s_copysign_v3f16:
1878 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
1879 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
1880 ; VI-NEXT: s_movk_i32 s6, 0x7fff
1881 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1882 ; VI-NEXT: v_mov_b32_e32 v0, s0
1883 ; VI-NEXT: v_mov_b32_e32 v1, s2
1884 ; VI-NEXT: s_lshr_b32 s2, s2, 16
1885 ; VI-NEXT: s_lshr_b32 s0, s0, 16
1886 ; VI-NEXT: v_bfi_b32 v0, s6, v0, v1
1887 ; VI-NEXT: v_mov_b32_e32 v1, s0
1888 ; VI-NEXT: v_mov_b32_e32 v2, s2
1889 ; VI-NEXT: v_bfi_b32 v1, s6, v1, v2
1890 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1891 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1892 ; VI-NEXT: v_mov_b32_e32 v0, s1
1893 ; VI-NEXT: v_mov_b32_e32 v1, s3
1894 ; VI-NEXT: s_add_u32 s0, s4, 4
1895 ; VI-NEXT: v_bfi_b32 v3, s6, v0, v1
1896 ; VI-NEXT: s_addc_u32 s1, s5, 0
1897 ; VI-NEXT: v_mov_b32_e32 v0, s0
1898 ; VI-NEXT: v_mov_b32_e32 v1, s1
1899 ; VI-NEXT: flat_store_short v[0:1], v3
1900 ; VI-NEXT: v_mov_b32_e32 v0, s4
1901 ; VI-NEXT: v_mov_b32_e32 v1, s5
1902 ; VI-NEXT: flat_store_dword v[0:1], v2
1905 ; GFX9-LABEL: s_copysign_v3f16:
1907 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
1908 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
1909 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
1910 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1911 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1912 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
1913 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1914 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
1915 ; GFX9-NEXT: s_lshr_b32 s0, s0, 16
1916 ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
1917 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
1918 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
1919 ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3
1920 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
1921 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
1922 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
1923 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1924 ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3
1925 ; GFX9-NEXT: global_store_short v0, v2, s[6:7] offset:4
1926 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
1927 ; GFX9-NEXT: s_endpgm
1929 ; GFX11-LABEL: s_copysign_v3f16:
1931 ; GFX11-NEXT: s_clause 0x1
1932 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
1933 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
1934 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1935 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
1936 ; GFX11-NEXT: s_lshr_b32 s2, s2, 16
1937 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1938 ; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s2
1939 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
1940 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16
1941 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1942 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2
1943 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
1944 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
1945 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
1946 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1947 ; GFX11-NEXT: s_clause 0x1
1948 ; GFX11-NEXT: global_store_b16 v3, v2, s[4:5] offset:4
1949 ; GFX11-NEXT: global_store_b32 v3, v0, s[4:5]
1950 ; GFX11-NEXT: s_endpgm
1951 %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign)
1952 store <3 x half> %out, ptr addrspace(1) %arg_out
; s_copysign_v4f16 - copysign on <4 x half> kernel arguments; the result is
; stored to %arg_out.
;
; What the generated checks show:
;   * All four targets read the two vector arguments with one 128-bit scalar
;     load and the output pointer with a separate 64-bit scalar load.
;   * tahiti (SI prefix) widens all four elements of each operand to f32
;     (v_cvt_f32_f16), applies v_bfi_b32 with the mask from
;     `s_brev_b32 s0, -2`, narrows back with v_cvt_f16_f32, and packs pairs
;     with a shift-left-by-16 and an OR.
;   * tonga, gfx900 and gfx1100 use v_bfi_b32 with a 0x7fff mask on the
;     unshifted registers (even elements) and on the registers shifted right
;     by 16 (odd elements), then repack each pair into a dword.
;   * On every target the two packed dwords leave through one 64-bit store.
;
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1956 define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) {
1957 ; SI-LABEL: s_copysign_v4f16:
1959 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
1960 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
1961 ; SI-NEXT: s_mov_b32 s7, 0xf000
1962 ; SI-NEXT: s_mov_b32 s6, -1
1963 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1964 ; SI-NEXT: s_lshr_b32 s8, s0, 16
1965 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
1966 ; SI-NEXT: s_lshr_b32 s0, s2, 16
1967 ; SI-NEXT: s_lshr_b32 s9, s1, 16
1968 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s0
1969 ; SI-NEXT: s_lshr_b32 s0, s3, 16
1970 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s8
1971 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s9
1972 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s0
1973 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
1974 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s2
1975 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s3
1976 ; SI-NEXT: s_brev_b32 s0, -2
1977 ; SI-NEXT: v_bfi_b32 v1, s0, v1, v5
1978 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v4
1979 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1980 ; SI-NEXT: v_bfi_b32 v3, s0, v3, v7
1981 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1982 ; SI-NEXT: v_bfi_b32 v2, s0, v2, v6
1983 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
1984 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
1985 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1986 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1987 ; SI-NEXT: v_or_b32_e32 v1, v3, v1
1988 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
1989 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1992 ; VI-LABEL: s_copysign_v4f16:
1994 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
1995 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
1996 ; VI-NEXT: s_movk_i32 s6, 0x7fff
1997 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1998 ; VI-NEXT: v_mov_b32_e32 v0, s1
1999 ; VI-NEXT: v_mov_b32_e32 v1, s3
2000 ; VI-NEXT: s_lshr_b32 s3, s3, 16
2001 ; VI-NEXT: s_lshr_b32 s1, s1, 16
2002 ; VI-NEXT: v_bfi_b32 v0, s6, v0, v1
2003 ; VI-NEXT: v_mov_b32_e32 v1, s1
2004 ; VI-NEXT: v_mov_b32_e32 v2, s3
2005 ; VI-NEXT: v_bfi_b32 v1, s6, v1, v2
2006 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2007 ; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2008 ; VI-NEXT: v_mov_b32_e32 v0, s0
2009 ; VI-NEXT: v_mov_b32_e32 v2, s2
2010 ; VI-NEXT: s_lshr_b32 s1, s2, 16
2011 ; VI-NEXT: s_lshr_b32 s0, s0, 16
2012 ; VI-NEXT: v_bfi_b32 v0, s6, v0, v2
2013 ; VI-NEXT: v_mov_b32_e32 v2, s0
2014 ; VI-NEXT: v_mov_b32_e32 v3, s1
2015 ; VI-NEXT: v_bfi_b32 v2, s6, v2, v3
2016 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2017 ; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2018 ; VI-NEXT: v_mov_b32_e32 v2, s4
2019 ; VI-NEXT: v_mov_b32_e32 v3, s5
2020 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2023 ; GFX9-LABEL: s_copysign_v4f16:
2025 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
2026 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2027 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
2028 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2029 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2030 ; GFX9-NEXT: v_mov_b32_e32 v0, s1
2031 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2032 ; GFX9-NEXT: s_lshr_b32 s3, s3, 16
2033 ; GFX9-NEXT: s_lshr_b32 s1, s1, 16
2034 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
2035 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2036 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2037 ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3
2038 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
2039 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
2040 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2041 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
2042 ; GFX9-NEXT: s_lshr_b32 s1, s2, 16
2043 ; GFX9-NEXT: s_lshr_b32 s0, s0, 16
2044 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v3
2045 ; GFX9-NEXT: v_mov_b32_e32 v3, s0
2046 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
2047 ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4
2048 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
2049 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
2050 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
2051 ; GFX9-NEXT: s_endpgm
2053 ; GFX11-LABEL: s_copysign_v4f16:
2055 ; GFX11-NEXT: s_clause 0x1
2056 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
2057 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
2058 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2059 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3
2060 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
2061 ; GFX11-NEXT: s_lshr_b32 s3, s3, 16
2062 ; GFX11-NEXT: s_lshr_b32 s2, s2, 16
2063 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2064 ; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2
2065 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0
2066 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
2067 ; GFX11-NEXT: s_lshr_b32 s6, s1, 16
2068 ; GFX11-NEXT: s_lshr_b32 s0, s0, 16
2069 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2
2070 ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3
2071 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
2072 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1
2073 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2074 ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0
2075 ; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4
2076 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5]
2077 ; GFX11-NEXT: s_endpgm
2078 %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign)
2079 store <4 x half> %out, ptr addrspace(1) %arg_out
; Attribute group #0 is the one referenced by the intrinsic declarations
; (llvm.copysign.* and llvm.amdgcn.workitem.id.x) at the top of this file.
2083 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }