1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefixes=SI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope --check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11 %s
; Intrinsic declarations used by the tests in this file: llvm.copysign for
; half, float, double and for 2-, 3- and 4-element half vectors, plus the
; AMDGPU workitem-id query used to index per-lane loads.
7 declare half @llvm.copysign.f16(half, half) #0
8 declare float @llvm.copysign.f32(float, float) #0
9 declare double @llvm.copysign.f64(double, double) #0
10 declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>) #0
11 declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>) #0
12 declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) #0
13 declare i32 @llvm.amdgcn.workitem.id.x() #0
; Kernel test: llvm.copysign.f16 where both the magnitude and the sign are
; half kernel arguments; the result is stored to %arg_out.
; The tahiti checks expect both halves to be converted to f32, combined with
; v_bfi_b32 and converted back; the other targets expect a single v_bfi_b32
; using the 0x7fff mask.
15 define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, half %sign) {
16 ; SI-LABEL: s_copysign_f16:
18 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
19 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
20 ; SI-NEXT: s_mov_b32 s3, 0xf000
21 ; SI-NEXT: s_waitcnt lgkmcnt(0)
22 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
23 ; SI-NEXT: s_lshr_b32 s2, s2, 16
24 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
25 ; SI-NEXT: s_brev_b32 s2, -2
26 ; SI-NEXT: v_bfi_b32 v0, s2, v0, v1
27 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
28 ; SI-NEXT: s_mov_b32 s2, -1
29 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
32 ; VI-LABEL: s_copysign_f16:
34 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
35 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
36 ; VI-NEXT: s_movk_i32 s3, 0x7fff
37 ; VI-NEXT: s_waitcnt lgkmcnt(0)
38 ; VI-NEXT: s_lshr_b32 s4, s2, 16
39 ; VI-NEXT: v_mov_b32_e32 v0, s2
40 ; VI-NEXT: v_mov_b32_e32 v1, s4
41 ; VI-NEXT: v_bfi_b32 v2, s3, v0, v1
42 ; VI-NEXT: v_mov_b32_e32 v0, s0
43 ; VI-NEXT: v_mov_b32_e32 v1, s1
44 ; VI-NEXT: flat_store_short v[0:1], v2
47 ; GFX9-LABEL: s_copysign_f16:
49 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
50 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
51 ; GFX9-NEXT: s_movk_i32 s0, 0x7fff
52 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
53 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
54 ; GFX9-NEXT: s_lshr_b32 s1, s4, 16
55 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
56 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
57 ; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
58 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
61 ; GFX11-LABEL: s_copysign_f16:
63 ; GFX11-NEXT: s_clause 0x1
64 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
65 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
66 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
67 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
68 ; GFX11-NEXT: s_lshr_b32 s3, s2, 16
69 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
70 ; GFX11-NEXT: v_mov_b32_e32 v0, s3
71 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
72 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
74 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
75 ; GFX11-NEXT: s_endpgm
76 %out = call half @llvm.copysign.f16(half %mag, half %sign)
77 store half %out, ptr addrspace(1) %arg_out
; Kernel test: llvm.copysign.f16 with a kernel-argument magnitude and the
; constant sign operand 0.0; the result is stored to %out.
; The checks for every run line expect the sign bit to be cleared with a
; scalar AND against 0x7fff.
81 define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %mag) {
82 ; SI-LABEL: s_test_copysign_f16_0:
84 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
85 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
86 ; SI-NEXT: s_mov_b32 s3, 0xf000
87 ; SI-NEXT: s_mov_b32 s2, -1
88 ; SI-NEXT: s_waitcnt lgkmcnt(0)
89 ; SI-NEXT: s_and_b32 s4, s4, 0x7fff
90 ; SI-NEXT: v_mov_b32_e32 v0, s4
91 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
94 ; VI-LABEL: s_test_copysign_f16_0:
96 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
97 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
98 ; VI-NEXT: s_waitcnt lgkmcnt(0)
99 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
100 ; VI-NEXT: v_mov_b32_e32 v0, s0
101 ; VI-NEXT: v_mov_b32_e32 v1, s1
102 ; VI-NEXT: v_mov_b32_e32 v2, s2
103 ; VI-NEXT: flat_store_short v[0:1], v2
106 ; GFX9-LABEL: s_test_copysign_f16_0:
108 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
109 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
110 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
111 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
112 ; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff
113 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
114 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
115 ; GFX9-NEXT: s_endpgm
117 ; GFX11-LABEL: s_test_copysign_f16_0:
119 ; GFX11-NEXT: s_clause 0x1
120 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
121 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
122 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
123 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
124 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
125 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
126 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
127 ; GFX11-NEXT: s_nop 0
128 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
129 ; GFX11-NEXT: s_endpgm
130 %result = call half @llvm.copysign.f16(half %mag, half 0.0)
131 store half %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f16 with a kernel-argument magnitude and the
; constant sign operand 1.0; the result is stored to %out.
; As with the 0.0 sign case, every run line's checks expect a scalar AND
; against 0x7fff.
135 define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %mag) {
136 ; SI-LABEL: s_test_copysign_f16_1:
138 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
139 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
140 ; SI-NEXT: s_mov_b32 s3, 0xf000
141 ; SI-NEXT: s_mov_b32 s2, -1
142 ; SI-NEXT: s_waitcnt lgkmcnt(0)
143 ; SI-NEXT: s_and_b32 s4, s4, 0x7fff
144 ; SI-NEXT: v_mov_b32_e32 v0, s4
145 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
148 ; VI-LABEL: s_test_copysign_f16_1:
150 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
151 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
152 ; VI-NEXT: s_waitcnt lgkmcnt(0)
153 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
154 ; VI-NEXT: v_mov_b32_e32 v0, s0
155 ; VI-NEXT: v_mov_b32_e32 v1, s1
156 ; VI-NEXT: v_mov_b32_e32 v2, s2
157 ; VI-NEXT: flat_store_short v[0:1], v2
160 ; GFX9-LABEL: s_test_copysign_f16_1:
162 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
163 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
164 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
165 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
166 ; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff
167 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
168 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
169 ; GFX9-NEXT: s_endpgm
171 ; GFX11-LABEL: s_test_copysign_f16_1:
173 ; GFX11-NEXT: s_clause 0x1
174 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
175 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
176 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
177 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
178 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
179 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
180 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
181 ; GFX11-NEXT: s_nop 0
182 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
183 ; GFX11-NEXT: s_endpgm
184 %result = call half @llvm.copysign.f16(half %mag, half 1.0)
185 store half %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f16 with a kernel-argument magnitude and the
; constant sign operand 10.0; the result is stored to %out.
; Every run line's checks expect a scalar AND against 0x7fff.
; NOTE(review): the ".0" suffix in this function name differs from the
; sibling tests (e.g. v_test_copysign_f16_10); kept as-is because the
; generated label checks depend on the symbol name.
189 define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half %mag) {
190 ; SI-LABEL: s_test_copysign_f16_10.0:
192 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
193 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
194 ; SI-NEXT: s_mov_b32 s3, 0xf000
195 ; SI-NEXT: s_mov_b32 s2, -1
196 ; SI-NEXT: s_waitcnt lgkmcnt(0)
197 ; SI-NEXT: s_and_b32 s4, s4, 0x7fff
198 ; SI-NEXT: v_mov_b32_e32 v0, s4
199 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
202 ; VI-LABEL: s_test_copysign_f16_10.0:
204 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
205 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
206 ; VI-NEXT: s_waitcnt lgkmcnt(0)
207 ; VI-NEXT: s_and_b32 s2, s2, 0x7fff
208 ; VI-NEXT: v_mov_b32_e32 v0, s0
209 ; VI-NEXT: v_mov_b32_e32 v1, s1
210 ; VI-NEXT: v_mov_b32_e32 v2, s2
211 ; VI-NEXT: flat_store_short v[0:1], v2
214 ; GFX9-LABEL: s_test_copysign_f16_10.0:
216 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
217 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
218 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
219 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
220 ; GFX9-NEXT: s_and_b32 s0, s4, 0x7fff
221 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
222 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
223 ; GFX9-NEXT: s_endpgm
225 ; GFX11-LABEL: s_test_copysign_f16_10.0:
227 ; GFX11-NEXT: s_clause 0x1
228 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
229 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
230 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
231 ; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
232 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
233 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
234 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
235 ; GFX11-NEXT: s_nop 0
236 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
237 ; GFX11-NEXT: s_endpgm
238 %result = call half @llvm.copysign.f16(half %mag, half 10.0)
239 store half %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f16 with a kernel-argument magnitude and the
; constant sign operand -1.0; the result is stored to %out.
; The checks expect bit 15 to be set: s_bitset1_b32 with bit index 15 for
; tahiti, tonga and gfx1100, and s_or_b32 with 0x8000 for gfx900.
243 define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) {
244 ; SI-LABEL: s_test_copysign_f16_neg1:
246 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
247 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
248 ; SI-NEXT: s_mov_b32 s3, 0xf000
249 ; SI-NEXT: s_mov_b32 s2, -1
250 ; SI-NEXT: s_waitcnt lgkmcnt(0)
251 ; SI-NEXT: s_bitset1_b32 s4, 15
252 ; SI-NEXT: v_mov_b32_e32 v0, s4
253 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
256 ; VI-LABEL: s_test_copysign_f16_neg1:
258 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
259 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
260 ; VI-NEXT: s_waitcnt lgkmcnt(0)
261 ; VI-NEXT: s_bitset1_b32 s2, 15
262 ; VI-NEXT: v_mov_b32_e32 v0, s0
263 ; VI-NEXT: v_mov_b32_e32 v1, s1
264 ; VI-NEXT: v_mov_b32_e32 v2, s2
265 ; VI-NEXT: flat_store_short v[0:1], v2
268 ; GFX9-LABEL: s_test_copysign_f16_neg1:
270 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
271 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
272 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
273 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
274 ; GFX9-NEXT: s_or_b32 s0, s4, 0x8000
275 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
276 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
277 ; GFX9-NEXT: s_endpgm
279 ; GFX11-LABEL: s_test_copysign_f16_neg1:
281 ; GFX11-NEXT: s_clause 0x1
282 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
283 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
284 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
285 ; GFX11-NEXT: s_bitset1_b32 s2, 15
286 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
287 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
288 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
289 ; GFX11-NEXT: s_nop 0
290 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
291 ; GFX11-NEXT: s_endpgm
292 %result = call half @llvm.copysign.f16(half %mag, half -1.0)
293 store half %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f16 with a kernel-argument magnitude and the
; constant sign operand -10.0; the result is stored to %out.
; The checks expect bit 15 to be set: s_bitset1_b32 with bit index 15 for
; tahiti, tonga and gfx1100, and s_or_b32 with 0x8000 for gfx900.
297 define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half %mag) {
298 ; SI-LABEL: s_test_copysign_f16_neg10:
300 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
301 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
302 ; SI-NEXT: s_mov_b32 s3, 0xf000
303 ; SI-NEXT: s_mov_b32 s2, -1
304 ; SI-NEXT: s_waitcnt lgkmcnt(0)
305 ; SI-NEXT: s_bitset1_b32 s4, 15
306 ; SI-NEXT: v_mov_b32_e32 v0, s4
307 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
310 ; VI-LABEL: s_test_copysign_f16_neg10:
312 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
313 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
314 ; VI-NEXT: s_waitcnt lgkmcnt(0)
315 ; VI-NEXT: s_bitset1_b32 s2, 15
316 ; VI-NEXT: v_mov_b32_e32 v0, s0
317 ; VI-NEXT: v_mov_b32_e32 v1, s1
318 ; VI-NEXT: v_mov_b32_e32 v2, s2
319 ; VI-NEXT: flat_store_short v[0:1], v2
322 ; GFX9-LABEL: s_test_copysign_f16_neg10:
324 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
325 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
326 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
327 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
328 ; GFX9-NEXT: s_or_b32 s0, s4, 0x8000
329 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
330 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
331 ; GFX9-NEXT: s_endpgm
333 ; GFX11-LABEL: s_test_copysign_f16_neg10:
335 ; GFX11-NEXT: s_clause 0x1
336 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
337 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
338 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
339 ; GFX11-NEXT: s_bitset1_b32 s2, 15
340 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
341 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
342 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
343 ; GFX11-NEXT: s_nop 0
344 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
345 ; GFX11-NEXT: s_endpgm
346 %result = call half @llvm.copysign.f16(half %mag, half -10.0)
347 store half %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f16 with the constant magnitude 0.0 and a
; kernel-argument sign; the result is stored to %out.
; The tahiti checks convert the sign to f32 and use v_bfi_b32 with a 0
; magnitude operand; the other targets expect the sign to be isolated with
; an AND against 0xffff8000.
351 define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half %sign) {
352 ; SI-LABEL: s_test_copysign_f16_0_mag:
354 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
355 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
356 ; SI-NEXT: s_mov_b32 s3, 0xf000
357 ; SI-NEXT: s_waitcnt lgkmcnt(0)
358 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
359 ; SI-NEXT: s_brev_b32 s2, -2
360 ; SI-NEXT: v_bfi_b32 v0, s2, 0, v0
361 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
362 ; SI-NEXT: s_mov_b32 s2, -1
363 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
366 ; VI-LABEL: s_test_copysign_f16_0_mag:
368 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
369 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
370 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
371 ; VI-NEXT: s_waitcnt lgkmcnt(0)
372 ; VI-NEXT: v_and_b32_e32 v2, s2, v0
373 ; VI-NEXT: v_mov_b32_e32 v0, s0
374 ; VI-NEXT: v_mov_b32_e32 v1, s1
375 ; VI-NEXT: flat_store_short v[0:1], v2
378 ; GFX9-LABEL: s_test_copysign_f16_0_mag:
380 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
381 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
382 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000
383 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
384 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
385 ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
386 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
387 ; GFX9-NEXT: s_endpgm
389 ; GFX11-LABEL: s_test_copysign_f16_0_mag:
391 ; GFX11-NEXT: s_clause 0x1
392 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
393 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
394 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
395 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
396 ; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s2
397 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
398 ; GFX11-NEXT: s_nop 0
399 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
400 ; GFX11-NEXT: s_endpgm
401 %result = call half @llvm.copysign.f16(half 0.0, half %sign)
402 store half %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f16 with the constant magnitude 1.0 and a
; kernel-argument sign; the result is stored to %out.
; The tahiti checks use v_bfi_b32 with the inline constant 1.0 on the f32
; form of the sign; the other targets expect the sign to be masked with
; 0xffff8000 and then ORed with 0x3c00.
407 define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half %sign) {
408 ; SI-LABEL: s_test_copysign_f16_1_mag:
410 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
411 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
412 ; SI-NEXT: s_mov_b32 s3, 0xf000
413 ; SI-NEXT: s_waitcnt lgkmcnt(0)
414 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
415 ; SI-NEXT: s_brev_b32 s2, -2
416 ; SI-NEXT: v_bfi_b32 v0, s2, 1.0, v0
417 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
418 ; SI-NEXT: s_mov_b32 s2, -1
419 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
422 ; VI-LABEL: s_test_copysign_f16_1_mag:
424 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
425 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
426 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
427 ; VI-NEXT: s_waitcnt lgkmcnt(0)
428 ; VI-NEXT: v_and_b32_e32 v0, s2, v0
429 ; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0
430 ; VI-NEXT: v_mov_b32_e32 v0, s0
431 ; VI-NEXT: v_mov_b32_e32 v1, s1
432 ; VI-NEXT: flat_store_short v[0:1], v2
435 ; GFX9-LABEL: s_test_copysign_f16_1_mag:
437 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
438 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
439 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000
440 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
441 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
442 ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
443 ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1
444 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
445 ; GFX9-NEXT: s_endpgm
447 ; GFX11-LABEL: s_test_copysign_f16_1_mag:
449 ; GFX11-NEXT: s_clause 0x1
450 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
451 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
452 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
453 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
454 ; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
455 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
456 ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0
457 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
458 ; GFX11-NEXT: s_nop 0
459 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
460 ; GFX11-NEXT: s_endpgm
461 %result = call half @llvm.copysign.f16(half 1.0, half %sign)
462 store half %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f16 with the constant magnitude 10.0 and a
; kernel-argument sign; the result is stored to %out.
; The tahiti checks materialize 0x41200000 in a VGPR as the v_bfi_b32
; magnitude operand; the other targets expect the sign to be masked with
; 0xffff8000 and then ORed with 0x4900.
466 define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, half %sign) {
467 ; SI-LABEL: s_test_copysign_f16_10_mag:
469 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
470 ; SI-NEXT: v_mov_b32_e32 v1, 0x41200000
471 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
472 ; SI-NEXT: s_mov_b32 s3, 0xf000
473 ; SI-NEXT: s_waitcnt lgkmcnt(0)
474 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
475 ; SI-NEXT: s_brev_b32 s2, -2
476 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0
477 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
478 ; SI-NEXT: s_mov_b32 s2, -1
479 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
482 ; VI-LABEL: s_test_copysign_f16_10_mag:
484 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
485 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
486 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
487 ; VI-NEXT: s_waitcnt lgkmcnt(0)
488 ; VI-NEXT: v_and_b32_e32 v0, s2, v0
489 ; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0
490 ; VI-NEXT: v_mov_b32_e32 v0, s0
491 ; VI-NEXT: v_mov_b32_e32 v1, s1
492 ; VI-NEXT: flat_store_short v[0:1], v2
495 ; GFX9-LABEL: s_test_copysign_f16_10_mag:
497 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
498 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
499 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000
500 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
501 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
502 ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
503 ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1
504 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
505 ; GFX9-NEXT: s_endpgm
507 ; GFX11-LABEL: s_test_copysign_f16_10_mag:
509 ; GFX11-NEXT: s_clause 0x1
510 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
511 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
512 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
513 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
514 ; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
515 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
516 ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0
517 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
518 ; GFX11-NEXT: s_nop 0
519 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
520 ; GFX11-NEXT: s_endpgm
521 %result = call half @llvm.copysign.f16(half 10.0, half %sign)
522 store half %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f16 with the constant magnitude -1.0 and a
; kernel-argument sign; the result is stored to %out.
; The tahiti checks use v_bfi_b32 with the inline constant -1.0; the other
; targets expect the sign to be masked with 0xffff8000 and then ORed with
; 0x3c00, the same constant checked in s_test_copysign_f16_1_mag.
526 define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, half %sign) {
527 ; SI-LABEL: s_test_copysign_f16_neg1_mag:
529 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
530 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
531 ; SI-NEXT: s_mov_b32 s3, 0xf000
532 ; SI-NEXT: s_waitcnt lgkmcnt(0)
533 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
534 ; SI-NEXT: s_brev_b32 s2, -2
535 ; SI-NEXT: v_bfi_b32 v0, s2, -1.0, v0
536 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
537 ; SI-NEXT: s_mov_b32 s2, -1
538 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
541 ; VI-LABEL: s_test_copysign_f16_neg1_mag:
543 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
544 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
545 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
546 ; VI-NEXT: s_waitcnt lgkmcnt(0)
547 ; VI-NEXT: v_and_b32_e32 v0, s2, v0
548 ; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0
549 ; VI-NEXT: v_mov_b32_e32 v0, s0
550 ; VI-NEXT: v_mov_b32_e32 v1, s1
551 ; VI-NEXT: flat_store_short v[0:1], v2
554 ; GFX9-LABEL: s_test_copysign_f16_neg1_mag:
556 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
557 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
558 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000
559 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
560 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
561 ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
562 ; GFX9-NEXT: v_or_b32_e32 v1, 0x3c00, v1
563 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
564 ; GFX9-NEXT: s_endpgm
566 ; GFX11-LABEL: s_test_copysign_f16_neg1_mag:
568 ; GFX11-NEXT: s_clause 0x1
569 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
570 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
571 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
572 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
573 ; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
574 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
575 ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0
576 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
577 ; GFX11-NEXT: s_nop 0
578 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
579 ; GFX11-NEXT: s_endpgm
580 %result = call half @llvm.copysign.f16(half -1.0, half %sign)
581 store half %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f16 with the constant magnitude -10.0 and a
; kernel-argument sign; the result is stored to %out.
; The tahiti checks materialize 0xc1200000 in a VGPR as the v_bfi_b32
; magnitude operand; the other targets expect the sign to be masked with
; 0xffff8000 and then ORed with 0x4900, the same constant checked in
; s_test_copysign_f16_10_mag.
585 define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, half %sign) {
586 ; SI-LABEL: s_test_copysign_f16_neg10_mag:
588 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
589 ; SI-NEXT: v_mov_b32_e32 v1, 0xc1200000
590 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
591 ; SI-NEXT: s_mov_b32 s3, 0xf000
592 ; SI-NEXT: s_waitcnt lgkmcnt(0)
593 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
594 ; SI-NEXT: s_brev_b32 s2, -2
595 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0
596 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
597 ; SI-NEXT: s_mov_b32 s2, -1
598 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
601 ; VI-LABEL: s_test_copysign_f16_neg10_mag:
603 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
604 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
605 ; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
606 ; VI-NEXT: s_waitcnt lgkmcnt(0)
607 ; VI-NEXT: v_and_b32_e32 v0, s2, v0
608 ; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0
609 ; VI-NEXT: v_mov_b32_e32 v0, s0
610 ; VI-NEXT: v_mov_b32_e32 v1, s1
611 ; VI-NEXT: flat_store_short v[0:1], v2
614 ; GFX9-LABEL: s_test_copysign_f16_neg10_mag:
616 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
617 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
618 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000
619 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
620 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
621 ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
622 ; GFX9-NEXT: v_or_b32_e32 v1, 0x4900, v1
623 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
624 ; GFX9-NEXT: s_endpgm
626 ; GFX11-LABEL: s_test_copysign_f16_neg10_mag:
628 ; GFX11-NEXT: s_clause 0x1
629 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
630 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
631 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
632 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
633 ; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
634 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
635 ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0
636 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
637 ; GFX11-NEXT: s_nop 0
638 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
639 ; GFX11-NEXT: s_endpgm
640 %result = call half @llvm.copysign.f16(half -10.0, half %sign)
641 store half %result, ptr addrspace(1) %out, align 4
; Non-kernel function test: llvm.copysign.f16 on two half function
; arguments (checked in v0 and v1).
; The tahiti checks round-trip v0 through f16/f32 conversions before a
; v_bfi_b32 whose mask comes from s_brev_b32; the other targets expect a
; single v_bfi_b32 with the 0x7fff mask.
645 define half @v_copysign_f16(half %mag, half %sign) {
646 ; SI-LABEL: v_copysign_f16:
648 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
649 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
650 ; SI-NEXT: s_brev_b32 s4, -2
651 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
652 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
653 ; SI-NEXT: s_setpc_b64 s[30:31]
655 ; VI-LABEL: v_copysign_f16:
657 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
658 ; VI-NEXT: s_movk_i32 s4, 0x7fff
659 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
660 ; VI-NEXT: s_setpc_b64 s[30:31]
662 ; GFX9-LABEL: v_copysign_f16:
664 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
665 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
666 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
667 ; GFX9-NEXT: s_setpc_b64 s[30:31]
669 ; GFX11-LABEL: v_copysign_f16:
671 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
672 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
673 ; GFX11-NEXT: s_setpc_b64 s[30:31]
674 %result = call half @llvm.copysign.f16(half %mag, half %sign)
; Non-kernel function test: llvm.copysign.f16 of a half argument with the
; constant sign operand 0.0.
; The tahiti checks expect the |v0| source modifier on the f16-to-f32
; conversion; the other targets expect a v_and_b32 with 0x7fff.
678 define half @v_test_copysign_f16_0(half %mag) {
679 ; SI-LABEL: v_test_copysign_f16_0:
681 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
682 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
683 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
684 ; SI-NEXT: s_setpc_b64 s[30:31]
686 ; VI-LABEL: v_test_copysign_f16_0:
688 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
689 ; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
690 ; VI-NEXT: s_setpc_b64 s[30:31]
692 ; GFX9-LABEL: v_test_copysign_f16_0:
694 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
695 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
696 ; GFX9-NEXT: s_setpc_b64 s[30:31]
698 ; GFX11-LABEL: v_test_copysign_f16_0:
700 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
701 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
702 ; GFX11-NEXT: s_setpc_b64 s[30:31]
703 %result = call half @llvm.copysign.f16(half %mag, half 0.0)
; Non-kernel function test: llvm.copysign.f16 of a half argument with the
; constant sign operand 1.0.
; The tahiti checks expect the |v0| source modifier on the f16-to-f32
; conversion; the other targets expect a v_and_b32 with 0x7fff.
707 define half @v_test_copysign_f16_1(half %mag) {
708 ; SI-LABEL: v_test_copysign_f16_1:
710 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
711 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
712 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
713 ; SI-NEXT: s_setpc_b64 s[30:31]
715 ; VI-LABEL: v_test_copysign_f16_1:
717 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
718 ; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
719 ; VI-NEXT: s_setpc_b64 s[30:31]
721 ; GFX9-LABEL: v_test_copysign_f16_1:
723 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
724 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
725 ; GFX9-NEXT: s_setpc_b64 s[30:31]
727 ; GFX11-LABEL: v_test_copysign_f16_1:
729 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
731 ; GFX11-NEXT: s_setpc_b64 s[30:31]
732 %result = call half @llvm.copysign.f16(half %mag, half 1.0)
; Non-kernel function test: llvm.copysign.f16 of a half argument with the
; constant sign operand 10.0.
; The tahiti checks expect the |v0| source modifier on the f16-to-f32
; conversion; the other targets expect a v_and_b32 with 0x7fff.
736 define half @v_test_copysign_f16_10(half %mag) {
737 ; SI-LABEL: v_test_copysign_f16_10:
739 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
740 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
741 ; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
742 ; SI-NEXT: s_setpc_b64 s[30:31]
744 ; VI-LABEL: v_test_copysign_f16_10:
746 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
747 ; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
748 ; VI-NEXT: s_setpc_b64 s[30:31]
750 ; GFX9-LABEL: v_test_copysign_f16_10:
752 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
753 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
754 ; GFX9-NEXT: s_setpc_b64 s[30:31]
756 ; GFX11-LABEL: v_test_copysign_f16_10:
758 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
759 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
760 ; GFX11-NEXT: s_setpc_b64 s[30:31]
761 %result = call half @llvm.copysign.f16(half %mag, half 10.0)
; Non-kernel function test: llvm.copysign.f16 of a half argument with the
; constant sign operand -1.0.
; The tahiti checks expect the -|v0| source modifier on the f16-to-f32
; conversion; the other targets expect a v_or_b32 with 0x8000.
765 define half @v_test_copysign_f16_neg1(half %mag) {
766 ; SI-LABEL: v_test_copysign_f16_neg1:
768 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
769 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
770 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
771 ; SI-NEXT: s_setpc_b64 s[30:31]
773 ; VI-LABEL: v_test_copysign_f16_neg1:
775 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
776 ; VI-NEXT: v_or_b32_e32 v0, 0x8000, v0
777 ; VI-NEXT: s_setpc_b64 s[30:31]
779 ; GFX9-LABEL: v_test_copysign_f16_neg1:
781 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
782 ; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
783 ; GFX9-NEXT: s_setpc_b64 s[30:31]
785 ; GFX11-LABEL: v_test_copysign_f16_neg1:
787 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
788 ; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
789 ; GFX11-NEXT: s_setpc_b64 s[30:31]
790 %result = call half @llvm.copysign.f16(half %mag, half -1.0)
; Non-kernel function test: llvm.copysign.f16 of a half argument with the
; constant sign operand -10.0.
; The tahiti checks expect the -|v0| source modifier on the f16-to-f32
; conversion; the other targets expect a v_or_b32 with 0x8000.
794 define half @v_test_copysign_f16_neg10(half %mag) {
795 ; SI-LABEL: v_test_copysign_f16_neg10:
797 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
798 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
799 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
800 ; SI-NEXT: s_setpc_b64 s[30:31]
802 ; VI-LABEL: v_test_copysign_f16_neg10:
804 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
805 ; VI-NEXT: v_or_b32_e32 v0, 0x8000, v0
806 ; VI-NEXT: s_setpc_b64 s[30:31]
808 ; GFX9-LABEL: v_test_copysign_f16_neg10:
810 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
811 ; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
812 ; GFX9-NEXT: s_setpc_b64 s[30:31]
814 ; GFX11-LABEL: v_test_copysign_f16_neg10:
816 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
817 ; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
818 ; GFX11-NEXT: s_setpc_b64 s[30:31]
819 %result = call half @llvm.copysign.f16(half %mag, half -10.0)
; Kernel test: mixed-width copysign. A half magnitude and a float sign are
; each loaded from their arrays at the workitem id x index; the magnitude
; is extended to float with fpext and passed to llvm.copysign.f32. The
; result is stored directly to %arg_out (no per-lane index is applied).
; All run lines' checks expect a v_cvt_f32_f16 of the loaded magnitude
; followed by one v_bfi_b32; on gfx1100 the mask is the literal 0x7fffffff,
; while the other targets load the mask with s_brev_b32.
823 define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
824 ; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
826 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
827 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
828 ; SI-NEXT: s_mov_b32 s11, 0xf000
829 ; SI-NEXT: s_mov_b32 s14, 0
830 ; SI-NEXT: s_mov_b32 s15, s11
831 ; SI-NEXT: s_waitcnt lgkmcnt(0)
832 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
833 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
834 ; SI-NEXT: v_mov_b32_e32 v2, 0
835 ; SI-NEXT: buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64
836 ; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
837 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
838 ; SI-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64
839 ; SI-NEXT: s_brev_b32 s0, -2
840 ; SI-NEXT: s_mov_b32 s10, -1
841 ; SI-NEXT: s_mov_b32 s8, s4
842 ; SI-NEXT: s_mov_b32 s9, s5
843 ; SI-NEXT: s_waitcnt vmcnt(1)
844 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3
845 ; SI-NEXT: s_waitcnt vmcnt(0)
846 ; SI-NEXT: v_bfi_b32 v0, s0, v1, v0
847 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
850 ; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
852 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
853 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
854 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
855 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
856 ; VI-NEXT: s_waitcnt lgkmcnt(0)
857 ; VI-NEXT: v_mov_b32_e32 v2, s7
858 ; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
859 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
860 ; VI-NEXT: flat_load_ushort v2, v[1:2]
861 ; VI-NEXT: v_mov_b32_e32 v1, s1
862 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
863 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
864 ; VI-NEXT: flat_load_dword v3, v[0:1]
865 ; VI-NEXT: s_brev_b32 s0, -2
866 ; VI-NEXT: v_mov_b32_e32 v0, s4
867 ; VI-NEXT: v_mov_b32_e32 v1, s5
868 ; VI-NEXT: s_waitcnt vmcnt(1)
869 ; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
870 ; VI-NEXT: s_waitcnt vmcnt(0)
871 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
872 ; VI-NEXT: flat_store_dword v[0:1], v2
875 ; GFX9-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
877 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
878 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
879 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
880 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
881 ; GFX9-NEXT: s_brev_b32 s0, -2
882 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
883 ; GFX9-NEXT: global_load_ushort v1, v1, s[6:7]
884 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
885 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
886 ; GFX9-NEXT: s_waitcnt vmcnt(1)
887 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
888 ; GFX9-NEXT: s_waitcnt vmcnt(0)
889 ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0
890 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
891 ; GFX9-NEXT: s_endpgm
893 ; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
895 ; GFX11-NEXT: s_clause 0x1
896 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
897 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
898 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
899 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
900 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
901 ; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
902 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
903 ; GFX11-NEXT: s_waitcnt vmcnt(1)
904 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
905 ; GFX11-NEXT: s_waitcnt vmcnt(0)
906 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
907 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v1, v0
908 ; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
909 ; GFX11-NEXT: s_nop 0
910 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
911 ; GFX11-NEXT: s_endpgm
912 %tid = call i32 @llvm.amdgcn.workitem.id.x()
913 %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
914 %mag = load half, ptr addrspace(1) %arg_mag_gep
915 %mag.ext = fpext half %mag to float
916 %arg_sign_gep = getelementptr float, ptr addrspace(1) %arg_sign, i32 %tid
917 %sign = load float, ptr addrspace(1) %arg_sign_gep
918 %out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
919 store float %out, ptr addrspace(1) %arg_out
; Kernel test: copysign producing f64 where the magnitude is an f16 value
; widened with fpext and the sign operand is already f64.
; Each work-item reads mag[tid] (half) and sign[tid] (double); the result is
; stored through %arg_out without a per-lane offset.
; The autogenerated assertions below expect, for every checked target, the half
; to be widened with v_cvt_f32_f16 followed by v_cvt_f64_f32, and the sign to be
; merged by a single v_bfi_b32 with mask 0x7fffffff on the high dword only
; (the mask is materialized with s_brev_b32 -2 or used as a literal).
923 define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
924 ; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
926 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
927 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
928 ; SI-NEXT: s_mov_b32 s11, 0xf000
929 ; SI-NEXT: s_mov_b32 s14, 0
930 ; SI-NEXT: s_mov_b32 s15, s11
931 ; SI-NEXT: s_waitcnt lgkmcnt(0)
932 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
933 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
934 ; SI-NEXT: v_mov_b32_e32 v2, 0
935 ; SI-NEXT: buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64
936 ; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
937 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
938 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[1:2], s[0:3], 0 addr64
939 ; SI-NEXT: s_brev_b32 s0, -2
940 ; SI-NEXT: s_mov_b32 s10, -1
941 ; SI-NEXT: s_mov_b32 s8, s4
942 ; SI-NEXT: s_mov_b32 s9, s5
943 ; SI-NEXT: s_waitcnt vmcnt(0)
944 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v3
945 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
946 ; SI-NEXT: v_bfi_b32 v3, s0, v3, v1
947 ; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
950 ; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
952 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
953 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
954 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
955 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
956 ; VI-NEXT: s_waitcnt lgkmcnt(0)
957 ; VI-NEXT: v_mov_b32_e32 v2, s7
958 ; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
959 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
960 ; VI-NEXT: flat_load_ushort v2, v[1:2]
961 ; VI-NEXT: v_mov_b32_e32 v1, s1
962 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
963 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
964 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
965 ; VI-NEXT: s_brev_b32 s0, -2
966 ; VI-NEXT: v_mov_b32_e32 v4, s4
967 ; VI-NEXT: v_mov_b32_e32 v5, s5
968 ; VI-NEXT: s_waitcnt vmcnt(0)
969 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v2
970 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
971 ; VI-NEXT: v_bfi_b32 v3, s0, v3, v1
972 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
975 ; GFX9-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
977 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
978 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
979 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
980 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
981 ; GFX9-NEXT: s_brev_b32 s0, -2
982 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
983 ; GFX9-NEXT: global_load_ushort v2, v1, s[6:7]
985 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
986 ; GFX9-NEXT: s_waitcnt vmcnt(0)
987 ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v2
988 ; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
989 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
990 ; GFX9-NEXT: v_bfi_b32 v3, s0, v3, v1
991 ; GFX9-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5]
992 ; GFX9-NEXT: s_endpgm
994 ; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
996 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
997 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
998 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
999 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1001 ; GFX11-NEXT: global_load_u16 v2, v1, s[6:7]
1002 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
1003 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1004 ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2
1005 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1006 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
1007 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1008 ; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v1
1009 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
1010 ; GFX11-NEXT: s_nop 0
1011 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1012 ; GFX11-NEXT: s_endpgm
; Per-lane index used for both input arrays.
1013 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1014 %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
1015 %mag = load half, ptr addrspace(1) %arg_mag_gep
; Widen the half magnitude so it matches the f64 sign operand.
1016 %mag.ext = fpext half %mag to double
1017 %arg_sign_gep = getelementptr double, ptr addrspace(1) %arg_sign, i32 %tid
1018 %sign = load double, ptr addrspace(1) %arg_sign_gep
1019 %out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
; The store address is %arg_out itself, not indexed by %tid.
1020 store double %out, ptr addrspace(1) %arg_out
; Kernel test: copysign producing f32 where the magnitude is already f32 and
; the sign operand is an f16 value widened with fpext.
; Each work-item reads mag[tid] (float) and sign[tid] (half); the result is
; stored through %arg_out without a per-lane offset.
; Per the autogenerated assertions below, the tahiti run still converts the
; half with v_cvt_f32_f16, while the other three runs replace the extension
; with a left shift by 16 of the loaded 16-bit value. All four then merge with
; a single v_bfi_b32 using mask 0x7fffffff.
1024 define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1025 ; SI-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
1027 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1028 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1029 ; SI-NEXT: s_mov_b32 s11, 0xf000
1030 ; SI-NEXT: s_mov_b32 s14, 0
1031 ; SI-NEXT: s_mov_b32 s15, s11
1032 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1033 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
1034 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1035 ; SI-NEXT: v_mov_b32_e32 v2, 0
1036 ; SI-NEXT: buffer_load_dword v3, v[1:2], s[12:15], 0 addr64
1037 ; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
1038 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
1039 ; SI-NEXT: buffer_load_ushort v0, v[1:2], s[0:3], 0 addr64
1040 ; SI-NEXT: s_brev_b32 s0, -2
1041 ; SI-NEXT: s_mov_b32 s10, -1
1042 ; SI-NEXT: s_mov_b32 s8, s4
1043 ; SI-NEXT: s_mov_b32 s9, s5
1044 ; SI-NEXT: s_waitcnt vmcnt(0)
1045 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1046 ; SI-NEXT: v_bfi_b32 v0, s0, v3, v0
1047 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
1050 ; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
1052 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1053 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1054 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1055 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1056 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1057 ; VI-NEXT: v_mov_b32_e32 v3, s7
1058 ; VI-NEXT: v_mov_b32_e32 v1, s1
1059 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1060 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1061 ; VI-NEXT: flat_load_ushort v4, v[0:1]
1062 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1063 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1064 ; VI-NEXT: flat_load_dword v2, v[0:1]
1065 ; VI-NEXT: s_brev_b32 s0, -2
1066 ; VI-NEXT: v_mov_b32_e32 v0, s4
1067 ; VI-NEXT: v_mov_b32_e32 v1, s5
1068 ; VI-NEXT: s_waitcnt vmcnt(1)
1069 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
1070 ; VI-NEXT: s_waitcnt vmcnt(0)
1071 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
1072 ; VI-NEXT: flat_store_dword v[0:1], v2
1075 ; GFX9-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
1077 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1078 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1079 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
1080 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1081 ; GFX9-NEXT: s_brev_b32 s0, -2
1082 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1083 ; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
1084 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1085 ; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
1086 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1087 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1088 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1089 ; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1
1090 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
1091 ; GFX9-NEXT: s_endpgm
1093 ; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
1095 ; GFX11-NEXT: s_clause 0x1
1096 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
1097 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1098 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
1099 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1100 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1101 ; GFX11-NEXT: global_load_u16 v1, v1, s[4:5]
1102 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1103 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1104 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1105 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1106 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1107 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
1108 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
1109 ; GFX11-NEXT: s_nop 0
1110 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1111 ; GFX11-NEXT: s_endpgm
; Per-lane index used for both input arrays.
1112 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1113 %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
1114 %mag = load float, ptr addrspace(1) %arg_mag_gep
1115 %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid
1116 %sign = load half, ptr addrspace(1) %arg_sign_gep
; Only the sign operand is widened; the magnitude is used as loaded.
1117 %sign.ext = fpext half %sign to float
1118 %out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
; The store address is %arg_out itself, not indexed by %tid.
1119 store float %out, ptr addrspace(1) %arg_out
; Kernel test: copysign producing f64 where the magnitude is already f64 and
; the sign operand is an f16 value widened with fpext.
; Each work-item reads mag[tid] (double) and sign[tid] (half); the result is
; stored through %arg_out without a per-lane offset.
; Per the autogenerated assertions below, the tahiti run converts the half
; with v_cvt_f32_f16, while the other three runs shift the loaded 16-bit value
; left by 16 instead. All four apply one v_bfi_b32 (mask 0x7fffffff) to the
; high dword of the magnitude and store the low dword as loaded.
1123 define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1124 ; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
1126 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1127 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1128 ; SI-NEXT: s_mov_b32 s11, 0xf000
1129 ; SI-NEXT: s_mov_b32 s14, 0
1130 ; SI-NEXT: s_mov_b32 s15, s11
1131 ; SI-NEXT: v_mov_b32_e32 v1, 0
1132 ; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
1133 ; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
1134 ; SI-NEXT: v_mov_b32_e32 v3, v1
1135 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1136 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[0:3], 0 addr64
1137 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
1138 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1139 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64
1140 ; SI-NEXT: s_brev_b32 s0, -2
1141 ; SI-NEXT: s_mov_b32 s10, -1
1142 ; SI-NEXT: s_mov_b32 s8, s4
1143 ; SI-NEXT: s_mov_b32 s9, s5
1144 ; SI-NEXT: s_waitcnt vmcnt(1)
1145 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
1146 ; SI-NEXT: s_waitcnt vmcnt(0)
1147 ; SI-NEXT: v_bfi_b32 v1, s0, v1, v2
1148 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1151 ; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
1153 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1154 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1155 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1156 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1157 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1158 ; VI-NEXT: v_mov_b32_e32 v3, s7
1159 ; VI-NEXT: v_mov_b32_e32 v1, s1
1160 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1161 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1162 ; VI-NEXT: flat_load_ushort v4, v[0:1]
1163 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1164 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1165 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1166 ; VI-NEXT: s_brev_b32 s0, -2
1167 ; VI-NEXT: v_mov_b32_e32 v2, s4
1168 ; VI-NEXT: v_mov_b32_e32 v3, s5
1169 ; VI-NEXT: s_waitcnt vmcnt(1)
1170 ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
1171 ; VI-NEXT: s_waitcnt vmcnt(0)
1172 ; VI-NEXT: v_bfi_b32 v1, s0, v1, v4
1173 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1176 ; GFX9-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
1178 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1179 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1180 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
1181 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1182 ; GFX9-NEXT: s_brev_b32 s0, -2
1183 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1184 ; GFX9-NEXT: global_load_ushort v2, v1, s[2:3]
1185 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1186 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
1187 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1188 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1189 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1190 ; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
1191 ; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
1192 ; GFX9-NEXT: s_endpgm
1194 ; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
1196 ; GFX11-NEXT: s_clause 0x1
1197 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
1198 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1199 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
1200 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 3, v0
1201 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1202 ; GFX11-NEXT: global_load_u16 v2, v1, s[4:5]
1203 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
1204 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1205 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1206 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1207 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1208 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
1209 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
1210 ; GFX11-NEXT: s_nop 0
1211 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1212 ; GFX11-NEXT: s_endpgm
; Per-lane index used for both input arrays.
1213 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1214 %arg_mag_gep = getelementptr double, ptr addrspace(1) %arg_mag, i32 %tid
1215 %mag = load double, ptr addrspace(1) %arg_mag_gep
1216 %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid
1217 %sign = load half, ptr addrspace(1) %arg_sign_gep
; Only the sign operand is widened; the magnitude is used as loaded.
1218 %sign.ext = fpext half %sign to double
1219 %out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
; The store address is %arg_out itself, not indexed by %tid.
1220 store double %out, ptr addrspace(1) %arg_out
; Kernel test: copysign producing f16 where the magnitude is already f16 and
; the sign operand is an f32 value narrowed with fptrunc.
; Each work-item reads mag[tid] (half) and sign[tid] (float); the result is
; stored through %arg_out without a per-lane offset.
; Per the autogenerated assertions below, no run emits a float-to-half
; conversion of the sign operand. The tahiti run widens the magnitude to f32,
; merges with v_bfi_b32 (mask 0x7fffffff) against the loaded float, and
; converts the result back to f16. The other three runs shift the loaded float
; right by 16 and merge with v_bfi_b32 using mask 0x7fff.
1224 define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1225 ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
1227 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1228 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1229 ; SI-NEXT: s_mov_b32 s11, 0xf000
1230 ; SI-NEXT: s_mov_b32 s14, 0
1231 ; SI-NEXT: s_mov_b32 s15, s11
1232 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1233 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
1234 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
1235 ; SI-NEXT: v_mov_b32_e32 v2, 0
1236 ; SI-NEXT: buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64
1237 ; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
1238 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1239 ; SI-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64
1240 ; SI-NEXT: s_brev_b32 s0, -2
1241 ; SI-NEXT: s_mov_b32 s10, -1
1242 ; SI-NEXT: s_mov_b32 s8, s4
1243 ; SI-NEXT: s_mov_b32 s9, s5
1244 ; SI-NEXT: s_waitcnt vmcnt(1)
1245 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v3
1246 ; SI-NEXT: s_waitcnt vmcnt(0)
1247 ; SI-NEXT: v_bfi_b32 v0, s0, v1, v0
1248 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1249 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
1252 ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
1254 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1255 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1256 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
1257 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1258 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1259 ; VI-NEXT: v_mov_b32_e32 v3, s7
1260 ; VI-NEXT: v_mov_b32_e32 v1, s1
1261 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1262 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1263 ; VI-NEXT: flat_load_dword v4, v[0:1]
1264 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
1265 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
1266 ; VI-NEXT: flat_load_ushort v2, v[0:1]
1267 ; VI-NEXT: s_movk_i32 s0, 0x7fff
1268 ; VI-NEXT: v_mov_b32_e32 v0, s4
1269 ; VI-NEXT: v_mov_b32_e32 v1, s5
1270 ; VI-NEXT: s_waitcnt vmcnt(1)
1271 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
1272 ; VI-NEXT: s_waitcnt vmcnt(0)
1273 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
1274 ; VI-NEXT: flat_store_short v[0:1], v2
1277 ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
1279 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1280 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1281 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1282 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1283 ; GFX9-NEXT: s_movk_i32 s0, 0x7fff
1284 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1285 ; GFX9-NEXT: global_load_dword v1, v1, s[2:3]
1286 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1287 ; GFX9-NEXT: global_load_ushort v0, v0, s[6:7]
1288 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1289 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1290 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1291 ; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1
1292 ; GFX9-NEXT: global_store_short v2, v0, s[4:5]
1293 ; GFX9-NEXT: s_endpgm
1295 ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
1297 ; GFX11-NEXT: s_clause 0x1
1298 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
1299 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1300 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
1301 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1302 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1303 ; GFX11-NEXT: global_load_b32 v1, v1, s[4:5]
1304 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
1305 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1306 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1307 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1308 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1309 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
1310 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
1311 ; GFX11-NEXT: s_nop 0
1312 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1313 ; GFX11-NEXT: s_endpgm
; Per-lane index used for both input arrays.
1314 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1315 %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
1316 %mag = load half, ptr addrspace(1) %arg_mag_gep
1317 %arg_sign_gep = getelementptr float, ptr addrspace(1) %arg_sign, i32 %tid
1318 %sign = load float, ptr addrspace(1) %arg_sign_gep
; Only the sign operand is narrowed; the magnitude is used as loaded.
1319 %sign.trunc = fptrunc float %sign to half
1320 %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
; The store address is %arg_out itself, not indexed by %tid.
1321 store half %out, ptr addrspace(1) %arg_out
; Kernel test: copysign producing f16 where the magnitude is already f16 and
; the sign operand is an f64 value narrowed with fptrunc.
; Each work-item reads sign[tid] (double). The magnitude is loaded from
; %arg_mag directly (see the review note in the IR body), and the result is
; stored through %arg_out without a per-lane offset.
; Per the autogenerated assertions below, no run emits a double-to-half
; conversion of the sign operand; only the high dword of the loaded double
; feeds v_bfi_b32. The tahiti run merges in f32 with mask 0x7fffffff and then
; converts back to f16. The other three runs shift the high dword right by 16
; and merge with mask 0x7fff.
; The magnitude load in the assertions carries no per-lane offset (for
; example "off" addressing or a zero VGPR offset), matching the IR as written.
1325 define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1326 ; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
1328 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1329 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1330 ; SI-NEXT: s_mov_b32 s3, 0xf000
1331 ; SI-NEXT: s_mov_b32 s2, -1
1332 ; SI-NEXT: s_mov_b32 s14, s2
1333 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1334 ; SI-NEXT: s_mov_b32 s12, s6
1335 ; SI-NEXT: s_mov_b32 s13, s7
1336 ; SI-NEXT: s_mov_b32 s15, s3
1337 ; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
1338 ; SI-NEXT: s_mov_b32 s10, 0
1339 ; SI-NEXT: s_mov_b32 s11, s3
1340 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1341 ; SI-NEXT: v_mov_b32_e32 v1, 0
1342 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
1343 ; SI-NEXT: s_brev_b32 s0, -2
1344 ; SI-NEXT: s_mov_b32 s1, s5
1345 ; SI-NEXT: s_waitcnt vmcnt(0)
1346 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v2
1347 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v1
1348 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1349 ; SI-NEXT: s_mov_b32 s0, s4
1350 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1353 ; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
1355 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1356 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1357 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
1358 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1359 ; VI-NEXT: v_mov_b32_e32 v0, s6
1360 ; VI-NEXT: v_mov_b32_e32 v2, s1
1361 ; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
1362 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1363 ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
1364 ; VI-NEXT: s_waitcnt vmcnt(0)
1365 ; VI-NEXT: v_mov_b32_e32 v1, s7
1366 ; VI-NEXT: flat_load_ushort v3, v[0:1]
1367 ; VI-NEXT: s_movk_i32 s0, 0x7fff
1368 ; VI-NEXT: v_mov_b32_e32 v0, s4
1369 ; VI-NEXT: v_mov_b32_e32 v1, s5
1370 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
1371 ; VI-NEXT: s_waitcnt vmcnt(0)
1372 ; VI-NEXT: v_bfi_b32 v2, s0, v3, v2
1373 ; VI-NEXT: flat_store_short v[0:1], v2
1376 ; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
1378 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1379 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1380 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1381 ; GFX9-NEXT: s_movk_i32 s0, 0x7fff
1382 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1383 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
1384 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1385 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1386 ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
1387 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1388 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1389 ; GFX9-NEXT: v_bfi_b32 v1, s0, v2, v1
1390 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
1391 ; GFX9-NEXT: s_endpgm
1393 ; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
1395 ; GFX11-NEXT: s_clause 0x1
1396 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
1397 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1398 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1399 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1400 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1401 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
1402 ; GFX11-NEXT: global_load_u16 v0, v2, s[2:3]
1403 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1404 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
1405 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1406 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1407 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
1408 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
1409 ; GFX11-NEXT: s_nop 0
1410 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1411 ; GFX11-NEXT: s_endpgm
1412 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1413 %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
; NOTE(review): the load below reads %arg_mag, so %arg_mag_gep above is unused
; and every lane loads the same magnitude. The sibling tests in this file load
; through their GEP; this looks like a typo for %arg_mag_gep -- TODO confirm.
; Changing it requires regenerating the assertions with
; utils/update_llc_test_checks.py, since they currently encode the uniform load.
1414 %mag = load half, ptr addrspace(1) %arg_mag
1415 %arg_sign_gep = getelementptr double, ptr addrspace(1) %arg_sign, i32 %tid
1416 %sign = load double, ptr addrspace(1) %arg_sign_gep
; Only the sign operand is narrowed; the magnitude is used as loaded.
1417 %sign.trunc = fptrunc double %sign to half
1418 %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
; The store address is %arg_out itself, not indexed by %tid.
1419 store half %out, ptr addrspace(1) %arg_out
; Kernel test: copysign producing f16 where the magnitude is an f32 value
; narrowed with fptrunc and the sign operand is already f16.
; Each work-item reads mag[tid] (float) and sign[tid] (half); the result is
; stored through %arg_out without a per-lane offset.
; Per the autogenerated assertions below, every run narrows the magnitude with
; v_cvt_f16_f32. The tahiti run then widens both operands back to f32, merges
; with v_bfi_b32 (mask 0x7fffffff) and converts the result to f16 again. The
; other three runs merge the two 16-bit values directly with v_bfi_b32 using
; mask 0x7fff.
1423 define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) {
1424 ; SI-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
1426 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1427 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1428 ; SI-NEXT: s_mov_b32 s11, 0xf000
1429 ; SI-NEXT: s_mov_b32 s14, 0
1430 ; SI-NEXT: s_mov_b32 s15, s11
1431 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1432 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
1433 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1434 ; SI-NEXT: v_mov_b32_e32 v2, 0
1435 ; SI-NEXT: buffer_load_dword v3, v[1:2], s[12:15], 0 addr64
1436 ; SI-NEXT: s_mov_b64 s[2:3], s[14:15]
1437 ; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
1438 ; SI-NEXT: buffer_load_ushort v0, v[1:2], s[0:3], 0 addr64
1439 ; SI-NEXT: s_brev_b32 s0, -2
1440 ; SI-NEXT: s_mov_b32 s10, -1
1441 ; SI-NEXT: s_mov_b32 s8, s4
1442 ; SI-NEXT: s_mov_b32 s9, s5
1443 ; SI-NEXT: s_waitcnt vmcnt(1)
1444 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
1445 ; SI-NEXT: s_waitcnt vmcnt(0)
1446 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1447 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1448 ; SI-NEXT: v_bfi_b32 v0, s0, v1, v0
1449 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1450 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
1453 ; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
1455 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1456 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1457 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1458 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1459 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1460 ; VI-NEXT: v_mov_b32_e32 v2, s7
1461 ; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
1462 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1463 ; VI-NEXT: flat_load_dword v2, v[1:2]
1464 ; VI-NEXT: v_mov_b32_e32 v1, s1
1465 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
1466 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1467 ; VI-NEXT: flat_load_ushort v3, v[0:1]
1468 ; VI-NEXT: s_movk_i32 s0, 0x7fff
1469 ; VI-NEXT: v_mov_b32_e32 v0, s4
1470 ; VI-NEXT: v_mov_b32_e32 v1, s5
1471 ; VI-NEXT: s_waitcnt vmcnt(1)
1472 ; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
1473 ; VI-NEXT: s_waitcnt vmcnt(0)
1474 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
1475 ; VI-NEXT: flat_store_short v[0:1], v2
1478 ; GFX9-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
1480 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1481 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1482 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1483 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1484 ; GFX9-NEXT: s_movk_i32 s0, 0x7fff
1485 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1486 ; GFX9-NEXT: global_load_dword v1, v1, s[6:7]
1487 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1488 ; GFX9-NEXT: global_load_ushort v0, v0, s[2:3]
1489 ; GFX9-NEXT: s_waitcnt vmcnt(1)
1490 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
1491 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1492 ; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0
1493 ; GFX9-NEXT: global_store_short v2, v0, s[4:5]
1494 ; GFX9-NEXT: s_endpgm
1496 ; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
1498 ; GFX11-NEXT: s_clause 0x1
1499 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1500 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1501 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
1502 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1503 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1504 ; GFX11-NEXT: global_load_b32 v1, v1, s[6:7]
1505 ; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
1506 ; GFX11-NEXT: s_waitcnt vmcnt(1)
1507 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
1508 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1509 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1510 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
1511 ; GFX11-NEXT: global_store_b16 v2, v0, s[4:5]
1512 ; GFX11-NEXT: s_nop 0
1513 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1514 ; GFX11-NEXT: s_endpgm
; Per-lane index used for both input arrays.
1515 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1516 %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
1517 %mag = load float, ptr addrspace(1) %arg_mag_gep
; Only the magnitude is narrowed; the sign operand is used as loaded.
1518 %mag.trunc = fptrunc float %mag to half
1519 %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid
1520 %sign = load half, ptr addrspace(1) %arg_sign_gep
1521 %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
; The store address is %arg_out itself, not indexed by %tid.
1522 store half %out, ptr addrspace(1) %arg_out
1526 define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) %arg_out, double %mag, half %sign) {
1527 ; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1529 ; SI-NEXT: s_load_dword s4, s[0:1], 0xd
1530 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1531 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1532 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
1533 ; SI-NEXT: s_lshr_b32 s4, s3, 8
1534 ; SI-NEXT: s_and_b32 s5, s3, 0x1ff
1535 ; SI-NEXT: s_and_b32 s6, s4, 0xffe
1536 ; SI-NEXT: s_or_b32 s2, s5, s2
1537 ; SI-NEXT: s_cmp_lg_u32 s2, 0
1538 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
1539 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
1540 ; SI-NEXT: v_readfirstlane_b32 s2, v1
1541 ; SI-NEXT: s_bfe_u32 s5, s3, 0xb0014
1542 ; SI-NEXT: s_or_b32 s2, s6, s2
1543 ; SI-NEXT: s_sub_i32 s6, 0x3f1, s5
1544 ; SI-NEXT: v_med3_i32 v1, s6, 0, 13
1545 ; SI-NEXT: s_or_b32 s4, s2, 0x1000
1546 ; SI-NEXT: v_readfirstlane_b32 s6, v1
1547 ; SI-NEXT: s_lshr_b32 s6, s4, s6
1548 ; SI-NEXT: v_lshl_b32_e32 v1, s6, v1
1549 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1
1550 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
1551 ; SI-NEXT: s_add_i32 s8, s5, 0xfffffc10
1552 ; SI-NEXT: v_readfirstlane_b32 s4, v1
1553 ; SI-NEXT: s_lshl_b32 s5, s8, 12
1554 ; SI-NEXT: s_or_b32 s4, s6, s4
1555 ; SI-NEXT: s_or_b32 s5, s2, s5
1556 ; SI-NEXT: s_cmp_lt_i32 s8, 1
1557 ; SI-NEXT: s_cselect_b32 s9, s4, s5
1558 ; SI-NEXT: s_and_b32 s6, s9, 7
1559 ; SI-NEXT: s_cmp_gt_i32 s6, 5
1560 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
1561 ; SI-NEXT: s_cmp_eq_u32 s6, 3
1562 ; SI-NEXT: s_cselect_b64 s[6:7], -1, 0
1563 ; SI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
1564 ; SI-NEXT: s_lshr_b32 s6, s9, 2
1565 ; SI-NEXT: s_or_b32 s4, s4, s5
1566 ; SI-NEXT: s_cmp_lg_u32 s4, 0
1567 ; SI-NEXT: s_addc_u32 s4, s6, 0
1568 ; SI-NEXT: s_cmp_lt_i32 s8, 31
1569 ; SI-NEXT: s_cselect_b32 s6, s4, 0x7c00
1570 ; SI-NEXT: s_cmp_lg_u32 s2, 0
1571 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
1572 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
1573 ; SI-NEXT: v_lshlrev_b32_e32 v1, 9, v1
1574 ; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f
1575 ; SI-NEXT: v_or_b32_e32 v1, 0x7c00, v1
1576 ; SI-NEXT: v_mov_b32_e32 v2, s6
1577 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1578 ; SI-NEXT: s_lshr_b32 s2, s3, 16
1579 ; SI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
1580 ; SI-NEXT: s_and_b32 s2, s2, 0x8000
1581 ; SI-NEXT: v_or_b32_e32 v1, s2, v1
1582 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1583 ; SI-NEXT: s_brev_b32 s2, -2
1584 ; SI-NEXT: s_mov_b32 s3, 0xf000
1585 ; SI-NEXT: v_bfi_b32 v0, s2, v1, v0
1586 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1587 ; SI-NEXT: s_mov_b32 s2, -1
1588 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1591 ; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1593 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1594 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
1595 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1596 ; VI-NEXT: s_lshr_b32 s0, s7, 8
1597 ; VI-NEXT: s_and_b32 s1, s7, 0x1ff
1598 ; VI-NEXT: s_and_b32 s2, s0, 0xffe
1599 ; VI-NEXT: s_or_b32 s0, s1, s6
1600 ; VI-NEXT: s_cmp_lg_u32 s0, 0
1601 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
1602 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1603 ; VI-NEXT: v_readfirstlane_b32 s0, v2
1604 ; VI-NEXT: s_bfe_u32 s1, s7, 0xb0014
1605 ; VI-NEXT: v_mov_b32_e32 v0, s4
1606 ; VI-NEXT: s_or_b32 s4, s2, s0
1607 ; VI-NEXT: s_sub_i32 s2, 0x3f1, s1
1608 ; VI-NEXT: v_med3_i32 v2, s2, 0, 13
1609 ; VI-NEXT: s_or_b32 s0, s4, 0x1000
1610 ; VI-NEXT: v_readfirstlane_b32 s2, v2
1611 ; VI-NEXT: s_lshr_b32 s2, s0, s2
1612 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2
1613 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, s0, v2
1614 ; VI-NEXT: v_mov_b32_e32 v1, s5
1615 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
1616 ; VI-NEXT: s_add_i32 s5, s1, 0xfffffc10
1617 ; VI-NEXT: v_readfirstlane_b32 s0, v2
1618 ; VI-NEXT: s_lshl_b32 s1, s5, 12
1619 ; VI-NEXT: s_or_b32 s0, s2, s0
1620 ; VI-NEXT: s_or_b32 s1, s4, s1
1621 ; VI-NEXT: s_cmp_lt_i32 s5, 1
1622 ; VI-NEXT: s_cselect_b32 s6, s0, s1
1623 ; VI-NEXT: s_and_b32 s2, s6, 7
1624 ; VI-NEXT: s_cmp_gt_i32 s2, 5
1625 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
1626 ; VI-NEXT: s_cmp_eq_u32 s2, 3
1627 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
1628 ; VI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
1629 ; VI-NEXT: s_lshr_b32 s2, s6, 2
1630 ; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
1631 ; VI-NEXT: s_addc_u32 s0, s2, 0
1632 ; VI-NEXT: s_cmp_lt_i32 s5, 31
1633 ; VI-NEXT: s_cselect_b32 s2, s0, 0x7c00
1634 ; VI-NEXT: s_cmp_lg_u32 s4, 0
1635 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
1636 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1637 ; VI-NEXT: v_lshlrev_b32_e32 v2, 9, v2
1638 ; VI-NEXT: s_cmpk_eq_i32 s5, 0x40f
1639 ; VI-NEXT: v_or_b32_e32 v2, 0x7c00, v2
1640 ; VI-NEXT: v_mov_b32_e32 v3, s2
1641 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1642 ; VI-NEXT: s_lshr_b32 s0, s7, 16
1643 ; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
1644 ; VI-NEXT: s_and_b32 s0, s0, 0x8000
1645 ; VI-NEXT: v_or_b32_e32 v2, s0, v2
1646 ; VI-NEXT: s_movk_i32 s0, 0x7fff
1647 ; VI-NEXT: v_mov_b32_e32 v3, s8
1648 ; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
1649 ; VI-NEXT: flat_store_short v[0:1], v2
1652 ; GFX9-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1654 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1655 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
1656 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1657 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1658 ; GFX9-NEXT: s_lshr_b32 s0, s7, 8
1659 ; GFX9-NEXT: s_and_b32 s1, s7, 0x1ff
1660 ; GFX9-NEXT: s_and_b32 s2, s0, 0xffe
1661 ; GFX9-NEXT: s_or_b32 s0, s1, s6
1662 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0
1663 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
1664 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
1665 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1
1666 ; GFX9-NEXT: s_bfe_u32 s1, s7, 0xb0014
1667 ; GFX9-NEXT: s_or_b32 s6, s2, s0
1668 ; GFX9-NEXT: s_sub_i32 s2, 0x3f1, s1
1669 ; GFX9-NEXT: v_med3_i32 v1, s2, 0, 13
1670 ; GFX9-NEXT: s_or_b32 s0, s6, 0x1000
1671 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1
1672 ; GFX9-NEXT: s_lshr_b32 s2, s0, s2
1673 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2
1674 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s0, v1
1675 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
1676 ; GFX9-NEXT: s_add_i32 s9, s1, 0xfffffc10
1677 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1
1678 ; GFX9-NEXT: s_lshl_b32 s1, s9, 12
1679 ; GFX9-NEXT: s_or_b32 s0, s2, s0
1680 ; GFX9-NEXT: s_or_b32 s1, s6, s1
1681 ; GFX9-NEXT: s_cmp_lt_i32 s9, 1
1682 ; GFX9-NEXT: s_cselect_b32 s10, s0, s1
1683 ; GFX9-NEXT: s_and_b32 s2, s10, 7
1684 ; GFX9-NEXT: s_cmp_gt_i32 s2, 5
1685 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
1686 ; GFX9-NEXT: s_cmp_eq_u32 s2, 3
1687 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
1688 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
1689 ; GFX9-NEXT: s_lshr_b32 s2, s10, 2
1690 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
1691 ; GFX9-NEXT: s_addc_u32 s0, s2, 0
1692 ; GFX9-NEXT: s_cmp_lt_i32 s9, 31
1693 ; GFX9-NEXT: s_cselect_b32 s2, s0, 0x7c00
1694 ; GFX9-NEXT: s_cmp_lg_u32 s6, 0
1695 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
1696 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
1697 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 9, v1
1698 ; GFX9-NEXT: s_cmpk_eq_i32 s9, 0x40f
1699 ; GFX9-NEXT: v_or_b32_e32 v1, 0x7c00, v1
1700 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1701 ; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
1702 ; GFX9-NEXT: s_lshr_b32 s0, s7, 16
1703 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
1704 ; GFX9-NEXT: s_and_b32 s0, s0, 0x8000
1705 ; GFX9-NEXT: v_or_b32_e32 v1, s0, v1
1706 ; GFX9-NEXT: s_movk_i32 s0, 0x7fff
1707 ; GFX9-NEXT: v_mov_b32_e32 v2, s8
1708 ; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
1709 ; GFX9-NEXT: global_store_short v0, v1, s[4:5]
1710 ; GFX9-NEXT: s_endpgm
1712 ; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
1714 ; GFX11-NEXT: s_clause 0x1
1715 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1716 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
1717 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1718 ; GFX11-NEXT: s_and_b32 s1, s7, 0x1ff
1719 ; GFX11-NEXT: s_lshr_b32 s2, s7, 8
1720 ; GFX11-NEXT: s_or_b32 s1, s1, s6
1721 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffe
1722 ; GFX11-NEXT: s_cmp_lg_u32 s1, 0
1723 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0
1724 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1725 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
1726 ; GFX11-NEXT: s_bfe_u32 s1, s7, 0xb0014
1727 ; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s1
1728 ; GFX11-NEXT: s_addk_i32 s1, 0xfc10
1729 ; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
1730 ; GFX11-NEXT: v_readfirstlane_b32 s3, v0
1731 ; GFX11-NEXT: s_lshl_b32 s8, s1, 12
1732 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1733 ; GFX11-NEXT: v_readfirstlane_b32 s6, v1
1734 ; GFX11-NEXT: s_or_b32 s2, s2, s3
1735 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1736 ; GFX11-NEXT: s_or_b32 s3, s2, 0x1000
1737 ; GFX11-NEXT: s_or_b32 s8, s2, s8
1738 ; GFX11-NEXT: s_lshr_b32 s6, s3, s6
1739 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1740 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6
1741 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1742 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s3, v0
1743 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
1744 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1745 ; GFX11-NEXT: v_readfirstlane_b32 s3, v0
1746 ; GFX11-NEXT: s_or_b32 s3, s6, s3
1747 ; GFX11-NEXT: s_cmp_lt_i32 s1, 1
1748 ; GFX11-NEXT: s_cselect_b32 s3, s3, s8
1749 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1750 ; GFX11-NEXT: s_and_b32 s6, s3, 7
1751 ; GFX11-NEXT: s_cmp_gt_i32 s6, 5
1752 ; GFX11-NEXT: s_cselect_b32 s8, -1, 0
1753 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3
1754 ; GFX11-NEXT: s_cselect_b32 s6, -1, 0
1755 ; GFX11-NEXT: s_lshr_b32 s3, s3, 2
1756 ; GFX11-NEXT: s_or_b32 s6, s6, s8
1757 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1758 ; GFX11-NEXT: s_cmp_lg_u32 s6, 0
1759 ; GFX11-NEXT: s_addc_u32 s3, s3, 0
1760 ; GFX11-NEXT: s_cmp_lt_i32 s1, 31
1761 ; GFX11-NEXT: s_cselect_b32 s3, s3, 0x7c00
1762 ; GFX11-NEXT: s_cmp_lg_u32 s2, 0
1763 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0
1764 ; GFX11-NEXT: s_cmpk_eq_i32 s1, 0x40f
1765 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
1766 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
1767 ; GFX11-NEXT: s_lshr_b32 s1, s7, 16
1768 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1769 ; GFX11-NEXT: s_and_b32 s1, s1, 0x8000
1770 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0
1771 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1772 ; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0
1773 ; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo
1774 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1775 ; GFX11-NEXT: v_or_b32_e32 v0, s1, v0
1776 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0
1777 ; GFX11-NEXT: global_store_b16 v1, v0, s[4:5]
1778 ; GFX11-NEXT: s_nop 0
1779 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1780 ; GFX11-NEXT: s_endpgm
1781 %mag.trunc = fptrunc double %mag to half
1782 %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
1783 store half %result, ptr addrspace(1) %arg_out
; s_copysign_v2f16: kernel that calls llvm.copysign.v2f16 on two <2 x half>
; kernel arguments (%arg_mag, %arg_sign) and stores the <2 x half> result
; through %arg_out.
; The check lines in this function are autogenerated (see the NOTE at the top
; of the file); regenerate them with the script instead of editing by hand.
; What the expected output shows for each RUN line:
;  - tahiti: both halves of each operand are widened with v_cvt_f32_f16,
;    merged with v_bfi_b32 under the mask built by s_brev_b32 -2, narrowed
;    again with v_cvt_f16_f32, and repacked with a shift and an or.
;  - tonga, gfx900, gfx1100: each 16-bit lane is merged directly with
;    v_bfi_b32 and a 0x7fff mask; the high lane is shifted back into the upper
;    half before one 32-bit store.
1787 define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half> %arg_mag, <2 x half> %arg_sign) {
1788 ; SI-LABEL: s_copysign_v2f16:
1790 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1791 ; SI-NEXT: s_mov_b32 s7, 0xf000
1792 ; SI-NEXT: s_mov_b32 s6, -1
1793 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1794 ; SI-NEXT: s_lshr_b32 s4, s2, 16
1795 ; SI-NEXT: s_lshr_b32 s5, s3, 16
1796 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
1797 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
1798 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
1799 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s3
1800 ; SI-NEXT: s_brev_b32 s2, -2
1801 ; SI-NEXT: v_bfi_b32 v0, s2, v0, v1
1802 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1803 ; SI-NEXT: v_bfi_b32 v1, s2, v2, v3
1804 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1805 ; SI-NEXT: s_mov_b32 s4, s0
1806 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1807 ; SI-NEXT: s_mov_b32 s5, s1
1808 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1809 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1812 ; VI-LABEL: s_copysign_v2f16:
1814 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1815 ; VI-NEXT: s_movk_i32 s4, 0x7fff
1816 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1817 ; VI-NEXT: v_mov_b32_e32 v0, s2
1818 ; VI-NEXT: v_mov_b32_e32 v1, s3
1819 ; VI-NEXT: s_lshr_b32 s3, s3, 16
1820 ; VI-NEXT: s_lshr_b32 s2, s2, 16
1821 ; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
1822 ; VI-NEXT: v_mov_b32_e32 v1, s2
1823 ; VI-NEXT: v_mov_b32_e32 v2, s3
1824 ; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
1825 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1826 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1827 ; VI-NEXT: v_mov_b32_e32 v0, s0
1828 ; VI-NEXT: v_mov_b32_e32 v1, s1
1829 ; VI-NEXT: flat_store_dword v[0:1], v2
1832 ; GFX9-LABEL: s_copysign_v2f16:
1834 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1835 ; GFX9-NEXT: s_movk_i32 s4, 0x7fff
1836 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1837 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1838 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1839 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
1840 ; GFX9-NEXT: s_lshr_b32 s3, s3, 16
1841 ; GFX9-NEXT: s_lshr_b32 s2, s2, 16
1842 ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
1843 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1844 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1845 ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3
1846 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
1847 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
1848 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1849 ; GFX9-NEXT: s_endpgm
1851 ; GFX11-LABEL: s_copysign_v2f16:
1853 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1854 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1855 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1856 ; GFX11-NEXT: v_mov_b32_e32 v0, s3
1857 ; GFX11-NEXT: s_lshr_b32 s3, s3, 16
1858 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1859 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
1860 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
1861 ; GFX11-NEXT: s_lshr_b32 s2, s2, 16
1862 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
1863 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
1864 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1865 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
1866 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1867 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
1868 ; GFX11-NEXT: s_nop 0
1869 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1870 ; GFX11-NEXT: s_endpgm
1871 %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign)
1872 store <2 x half> %out, ptr addrspace(1) %arg_out
; s_copysign_v3f16: kernel that calls llvm.copysign.v3f16 on two <3 x half>
; kernel arguments (%arg_mag, %arg_sign) and stores the <3 x half> result
; through %arg_out.
; The check lines in this function are autogenerated (see the NOTE at the top
; of the file); regenerate them with the script instead of editing by hand.
; What the expected output shows for each RUN line:
;  - All four targets write the result as two stores: a 32-bit store of the
;    first two packed elements at offset 0 and a 16-bit store of the third
;    element at offset 4.
;  - tahiti: every element is widened with v_cvt_f32_f16, merged with
;    v_bfi_b32 under the mask built by s_brev_b32 -2, then narrowed again with
;    v_cvt_f16_f32.
;  - tonga, gfx900, gfx1100: every 16-bit element is merged directly with
;    v_bfi_b32 and a 0x7fff mask.
1876 define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) {
1877 ; SI-LABEL: s_copysign_v3f16:
1879 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
1880 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1881 ; SI-NEXT: s_mov_b32 s3, 0xf000
1882 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1883 ; SI-NEXT: s_lshr_b32 s2, s4, 16
1884 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
1885 ; SI-NEXT: s_lshr_b32 s2, s6, 16
1886 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s2
1887 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s5
1888 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s4
1889 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s7
1890 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6
1891 ; SI-NEXT: s_brev_b32 s2, -2
1892 ; SI-NEXT: v_bfi_b32 v2, s2, v2, v3
1893 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
1894 ; SI-NEXT: v_bfi_b32 v1, s2, v1, v5
1895 ; SI-NEXT: v_bfi_b32 v0, s2, v0, v4
1896 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
1897 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1898 ; SI-NEXT: s_mov_b32 s2, -1
1899 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1900 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
1901 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
1902 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
1905 ; VI-LABEL: s_copysign_v3f16:
1907 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
1908 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1909 ; VI-NEXT: s_movk_i32 s2, 0x7fff
1910 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1911 ; VI-NEXT: v_mov_b32_e32 v0, s4
1912 ; VI-NEXT: v_mov_b32_e32 v1, s6
1913 ; VI-NEXT: s_lshr_b32 s3, s6, 16
1914 ; VI-NEXT: s_lshr_b32 s4, s4, 16
1915 ; VI-NEXT: v_bfi_b32 v0, s2, v0, v1
1916 ; VI-NEXT: v_mov_b32_e32 v1, s4
1917 ; VI-NEXT: v_mov_b32_e32 v2, s3
1918 ; VI-NEXT: v_bfi_b32 v1, s2, v1, v2
1919 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1920 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1921 ; VI-NEXT: v_mov_b32_e32 v0, s5
1922 ; VI-NEXT: v_mov_b32_e32 v1, s7
1923 ; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
1924 ; VI-NEXT: s_add_u32 s2, s0, 4
1925 ; VI-NEXT: s_addc_u32 s3, s1, 0
1926 ; VI-NEXT: v_mov_b32_e32 v0, s2
1927 ; VI-NEXT: v_mov_b32_e32 v1, s3
1928 ; VI-NEXT: flat_store_short v[0:1], v3
1929 ; VI-NEXT: v_mov_b32_e32 v0, s0
1930 ; VI-NEXT: v_mov_b32_e32 v1, s1
1931 ; VI-NEXT: flat_store_dword v[0:1], v2
1934 ; GFX9-LABEL: s_copysign_v3f16:
1936 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
1937 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1938 ; GFX9-NEXT: s_movk_i32 s0, 0x7fff
1939 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1940 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1941 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1942 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
1943 ; GFX9-NEXT: s_lshr_b32 s1, s6, 16
1944 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16
1945 ; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
1946 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
1947 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1948 ; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3
1949 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
1950 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
1951 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
1952 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
1953 ; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3
1954 ; GFX9-NEXT: global_store_short v0, v2, s[2:3] offset:4
1955 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
1956 ; GFX9-NEXT: s_endpgm
1958 ; GFX11-LABEL: s_copysign_v3f16:
1960 ; GFX11-NEXT: s_clause 0x1
1961 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
1962 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1963 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
1964 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1965 ; GFX11-NEXT: s_lshr_b32 s2, s6, 16
1966 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
1967 ; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2
1968 ; GFX11-NEXT: s_lshr_b32 s2, s4, 16
1969 ; GFX11-NEXT: v_mov_b32_e32 v2, s7
1970 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0
1971 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1972 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
1973 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s5, v2
1974 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
1975 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
1976 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
1977 ; GFX11-NEXT: s_clause 0x1
1978 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4
1979 ; GFX11-NEXT: global_store_b32 v3, v0, s[0:1]
1980 ; GFX11-NEXT: s_nop 0
1981 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1982 ; GFX11-NEXT: s_endpgm
1983 %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign)
1984 store <3 x half> %out, ptr addrspace(1) %arg_out
; s_copysign_v4f16: kernel that calls llvm.copysign.v4f16 on two <4 x half>
; kernel arguments (%arg_mag, %arg_sign) and stores the <4 x half> result
; through %arg_out.
; The check lines in this function are autogenerated (see the NOTE at the top
; of the file); regenerate them with the script instead of editing by hand.
; What the expected output shows for each RUN line:
;  - All four targets write the packed result with a single 64-bit store.
;  - tahiti: every element is widened with v_cvt_f32_f16, merged with
;    v_bfi_b32 under the mask built by s_brev_b32 -2, narrowed again with
;    v_cvt_f16_f32, and each pair is repacked with a shift and an or.
;  - tonga, gfx900, gfx1100: every 16-bit element is merged directly with
;    v_bfi_b32 and a 0x7fff mask, then each pair is repacked into one dword.
1988 define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) {
1989 ; SI-LABEL: s_copysign_v4f16:
1991 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
1992 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1993 ; SI-NEXT: s_mov_b32 s3, 0xf000
1994 ; SI-NEXT: s_mov_b32 s2, -1
1995 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1996 ; SI-NEXT: s_lshr_b32 s8, s4, 16
1997 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
1998 ; SI-NEXT: s_lshr_b32 s4, s6, 16
1999 ; SI-NEXT: s_lshr_b32 s9, s5, 16
2000 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
2001 ; SI-NEXT: s_lshr_b32 s4, s7, 16
2002 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s8
2003 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s9
2004 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
2005 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s5
2006 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s6
2007 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s7
2008 ; SI-NEXT: s_brev_b32 s4, -2
2009 ; SI-NEXT: v_bfi_b32 v1, s4, v1, v5
2010 ; SI-NEXT: v_bfi_b32 v0, s4, v0, v4
2011 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
2012 ; SI-NEXT: v_bfi_b32 v3, s4, v3, v7
2013 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
2014 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v6
2015 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
2016 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
2017 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2018 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
2019 ; SI-NEXT: v_or_b32_e32 v1, v3, v1
2020 ; SI-NEXT: v_or_b32_e32 v0, v2, v0
2021 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2024 ; VI-LABEL: s_copysign_v4f16:
2026 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
2027 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2028 ; VI-NEXT: s_movk_i32 s2, 0x7fff
2029 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2030 ; VI-NEXT: v_mov_b32_e32 v0, s5
2031 ; VI-NEXT: v_mov_b32_e32 v1, s7
2032 ; VI-NEXT: s_lshr_b32 s3, s7, 16
2033 ; VI-NEXT: s_lshr_b32 s5, s5, 16
2034 ; VI-NEXT: v_bfi_b32 v0, s2, v0, v1
2035 ; VI-NEXT: v_mov_b32_e32 v1, s5
2036 ; VI-NEXT: v_mov_b32_e32 v2, s3
2037 ; VI-NEXT: v_bfi_b32 v1, s2, v1, v2
2038 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2039 ; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2040 ; VI-NEXT: v_mov_b32_e32 v0, s4
2041 ; VI-NEXT: v_mov_b32_e32 v2, s6
2042 ; VI-NEXT: s_lshr_b32 s3, s6, 16
2043 ; VI-NEXT: s_lshr_b32 s4, s4, 16
2044 ; VI-NEXT: v_bfi_b32 v0, s2, v0, v2
2045 ; VI-NEXT: v_mov_b32_e32 v2, s4
2046 ; VI-NEXT: v_mov_b32_e32 v3, s3
2047 ; VI-NEXT: v_bfi_b32 v2, s2, v2, v3
2048 ; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2049 ; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2050 ; VI-NEXT: v_mov_b32_e32 v3, s1
2051 ; VI-NEXT: v_mov_b32_e32 v2, s0
2052 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2055 ; GFX9-LABEL: s_copysign_v4f16:
2057 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
2058 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2059 ; GFX9-NEXT: s_movk_i32 s0, 0x7fff
2060 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2061 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2062 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
2063 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2064 ; GFX9-NEXT: s_lshr_b32 s1, s7, 16
2065 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16
2066 ; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1
2067 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2068 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
2069 ; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v3
2070 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
2071 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0
2072 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2073 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
2074 ; GFX9-NEXT: s_lshr_b32 s1, s6, 16
2075 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16
2076 ; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v3
2077 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
2078 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
2079 ; GFX9-NEXT: v_bfi_b32 v3, s0, v3, v4
2080 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
2081 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
2082 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
2083 ; GFX9-NEXT: s_endpgm
2085 ; GFX11-LABEL: s_copysign_v4f16:
2087 ; GFX11-NEXT: s_clause 0x1
2088 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
2089 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2090 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2091 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7
2092 ; GFX11-NEXT: v_mov_b32_e32 v1, s6
2093 ; GFX11-NEXT: s_lshr_b32 s2, s7, 16
2094 ; GFX11-NEXT: s_lshr_b32 s6, s6, 16
2095 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2096 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s6
2097 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s5, v0
2098 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s4, v1
2099 ; GFX11-NEXT: s_lshr_b32 s3, s5, 16
2100 ; GFX11-NEXT: s_lshr_b32 s2, s4, 16
2101 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s3, v2
2102 ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s2, v3
2103 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
2104 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1
2105 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2106 ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0
2107 ; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4
2108 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
2109 ; GFX11-NEXT: s_nop 0
2110 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2111 ; GFX11-NEXT: s_endpgm
2112 %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign)
2113 store <4 x half> %out, ptr addrspace(1) %arg_out
; Attribute group #0 is the one referenced by the intrinsic declarations
; (llvm.copysign.* and llvm.amdgcn.workitem.id.x) at the top of this file.
2117 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }