1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SIVI,SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=SIVI,VI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
; Kernel test: llvm.copysign.f32 where both the magnitude and the sign come from
; kernel arguments; the result is stored to %out. Every target is expected to
; select a single v_bfi_b32. The SI and VI checks pass the mask in an SGPR that
; is written by s_brev_b32 -2, while the GFX11 checks use the literal 0x7fffffff.
6 define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag, float %sign) {
7 ; SI-LABEL: s_test_copysign_f32:
9 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
10 ; SI-NEXT: s_brev_b32 s8, -2
11 ; SI-NEXT: s_mov_b32 s7, 0xf000
12 ; SI-NEXT: s_mov_b32 s6, -1
13 ; SI-NEXT: s_waitcnt lgkmcnt(0)
14 ; SI-NEXT: v_mov_b32_e32 v0, s2
15 ; SI-NEXT: v_mov_b32_e32 v1, s3
16 ; SI-NEXT: s_mov_b32 s4, s0
17 ; SI-NEXT: s_mov_b32 s5, s1
18 ; SI-NEXT: v_bfi_b32 v0, s8, v0, v1
19 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
22 ; VI-LABEL: s_test_copysign_f32:
24 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
25 ; VI-NEXT: s_brev_b32 s4, -2
26 ; VI-NEXT: s_waitcnt lgkmcnt(0)
27 ; VI-NEXT: v_mov_b32_e32 v0, s2
28 ; VI-NEXT: v_mov_b32_e32 v1, s3
29 ; VI-NEXT: v_bfi_b32 v2, s4, v0, v1
30 ; VI-NEXT: v_mov_b32_e32 v0, s0
31 ; VI-NEXT: v_mov_b32_e32 v1, s1
32 ; VI-NEXT: flat_store_dword v[0:1], v2
35 ; GFX11-LABEL: s_test_copysign_f32:
37 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
38 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
39 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
40 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
41 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
42 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
44 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
45 ; GFX11-NEXT: s_endpgm
46 %result = call float @llvm.copysign.f32(float %mag, float %sign)
47 store float %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f32 with the sign operand fixed to the constant 0.0.
; The checks expect no v_bfi_b32 here; instead each target applies a scalar
; s_bitset0_b32 to bit 31 of the loaded magnitude before storing it to %out.
51 define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %mag) {
52 ; SI-LABEL: s_test_copysign_f32_0:
54 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
55 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
56 ; SI-NEXT: s_mov_b32 s3, 0xf000
57 ; SI-NEXT: s_mov_b32 s2, -1
58 ; SI-NEXT: s_waitcnt lgkmcnt(0)
59 ; SI-NEXT: s_bitset0_b32 s4, 31
60 ; SI-NEXT: v_mov_b32_e32 v0, s4
61 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
64 ; VI-LABEL: s_test_copysign_f32_0:
66 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
67 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
68 ; VI-NEXT: s_waitcnt lgkmcnt(0)
69 ; VI-NEXT: s_bitset0_b32 s2, 31
70 ; VI-NEXT: v_mov_b32_e32 v0, s0
71 ; VI-NEXT: v_mov_b32_e32 v1, s1
72 ; VI-NEXT: v_mov_b32_e32 v2, s2
73 ; VI-NEXT: flat_store_dword v[0:1], v2
76 ; GFX11-LABEL: s_test_copysign_f32_0:
78 ; GFX11-NEXT: s_clause 0x1
79 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
80 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
81 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX11-NEXT: s_bitset0_b32 s2, 31
83 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
84 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
85 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
87 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
88 ; GFX11-NEXT: s_endpgm
89 %result = call float @llvm.copysign.f32(float %mag, float 0.0)
90 store float %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f32 with the sign operand fixed to the constant 1.0.
; The expected instruction sequence uses s_bitset0_b32 on bit 31 of the
; magnitude, the same operation the checks for the 0.0 sign constant expect.
94 define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %mag) {
95 ; SI-LABEL: s_test_copysign_f32_1:
97 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
98 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
99 ; SI-NEXT: s_mov_b32 s3, 0xf000
100 ; SI-NEXT: s_mov_b32 s2, -1
101 ; SI-NEXT: s_waitcnt lgkmcnt(0)
102 ; SI-NEXT: s_bitset0_b32 s4, 31
103 ; SI-NEXT: v_mov_b32_e32 v0, s4
104 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
107 ; VI-LABEL: s_test_copysign_f32_1:
109 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
110 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
111 ; VI-NEXT: s_waitcnt lgkmcnt(0)
112 ; VI-NEXT: s_bitset0_b32 s2, 31
113 ; VI-NEXT: v_mov_b32_e32 v0, s0
114 ; VI-NEXT: v_mov_b32_e32 v1, s1
115 ; VI-NEXT: v_mov_b32_e32 v2, s2
116 ; VI-NEXT: flat_store_dword v[0:1], v2
119 ; GFX11-LABEL: s_test_copysign_f32_1:
121 ; GFX11-NEXT: s_clause 0x1
122 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
123 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
124 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
125 ; GFX11-NEXT: s_bitset0_b32 s2, 31
126 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
127 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
128 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
129 ; GFX11-NEXT: s_nop 0
130 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
131 ; GFX11-NEXT: s_endpgm
132 %result = call float @llvm.copysign.f32(float %mag, float 1.0)
133 store float %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f32 with the sign operand fixed to the constant 10.0.
; As with the other non-negative sign constants in this file, the checks expect
; a scalar s_bitset0_b32 on bit 31 of the magnitude and no v_bfi_b32.
137 define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float %mag) {
138 ; SI-LABEL: s_test_copysign_f32_10.0:
140 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
141 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
142 ; SI-NEXT: s_mov_b32 s3, 0xf000
143 ; SI-NEXT: s_mov_b32 s2, -1
144 ; SI-NEXT: s_waitcnt lgkmcnt(0)
145 ; SI-NEXT: s_bitset0_b32 s4, 31
146 ; SI-NEXT: v_mov_b32_e32 v0, s4
147 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
150 ; VI-LABEL: s_test_copysign_f32_10.0:
152 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
153 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
154 ; VI-NEXT: s_waitcnt lgkmcnt(0)
155 ; VI-NEXT: s_bitset0_b32 s2, 31
156 ; VI-NEXT: v_mov_b32_e32 v0, s0
157 ; VI-NEXT: v_mov_b32_e32 v1, s1
158 ; VI-NEXT: v_mov_b32_e32 v2, s2
159 ; VI-NEXT: flat_store_dword v[0:1], v2
162 ; GFX11-LABEL: s_test_copysign_f32_10.0:
164 ; GFX11-NEXT: s_clause 0x1
165 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
166 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
167 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
168 ; GFX11-NEXT: s_bitset0_b32 s2, 31
169 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
170 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
171 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
172 ; GFX11-NEXT: s_nop 0
173 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
174 ; GFX11-NEXT: s_endpgm
175 %result = call float @llvm.copysign.f32(float %mag, float 10.0)
176 store float %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f32 with the sign operand fixed to the constant -1.0.
; For a negative sign constant the checks expect a scalar s_bitset1_b32 on bit 31
; of the magnitude (rather than s_bitset0_b32) before the store to %out.
180 define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float %mag) {
181 ; SI-LABEL: s_test_copysign_f32_neg1:
183 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
184 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
185 ; SI-NEXT: s_mov_b32 s3, 0xf000
186 ; SI-NEXT: s_mov_b32 s2, -1
187 ; SI-NEXT: s_waitcnt lgkmcnt(0)
188 ; SI-NEXT: s_bitset1_b32 s4, 31
189 ; SI-NEXT: v_mov_b32_e32 v0, s4
190 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
193 ; VI-LABEL: s_test_copysign_f32_neg1:
195 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
196 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
197 ; VI-NEXT: s_waitcnt lgkmcnt(0)
198 ; VI-NEXT: s_bitset1_b32 s2, 31
199 ; VI-NEXT: v_mov_b32_e32 v0, s0
200 ; VI-NEXT: v_mov_b32_e32 v1, s1
201 ; VI-NEXT: v_mov_b32_e32 v2, s2
202 ; VI-NEXT: flat_store_dword v[0:1], v2
205 ; GFX11-LABEL: s_test_copysign_f32_neg1:
207 ; GFX11-NEXT: s_clause 0x1
208 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
209 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
210 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
211 ; GFX11-NEXT: s_bitset1_b32 s2, 31
212 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
213 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
214 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
215 ; GFX11-NEXT: s_nop 0
216 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
217 ; GFX11-NEXT: s_endpgm
218 %result = call float @llvm.copysign.f32(float %mag, float -1.0)
219 store float %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f32 with the sign operand fixed to the constant -10.0.
; The expected code uses s_bitset1_b32 on bit 31 of the magnitude, the same
; operation the checks for the -1.0 sign constant expect.
223 define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, float %mag) {
224 ; SI-LABEL: s_test_copysign_f32_neg10:
226 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
227 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
228 ; SI-NEXT: s_mov_b32 s3, 0xf000
229 ; SI-NEXT: s_mov_b32 s2, -1
230 ; SI-NEXT: s_waitcnt lgkmcnt(0)
231 ; SI-NEXT: s_bitset1_b32 s4, 31
232 ; SI-NEXT: v_mov_b32_e32 v0, s4
233 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
236 ; VI-LABEL: s_test_copysign_f32_neg10:
238 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
239 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
240 ; VI-NEXT: s_waitcnt lgkmcnt(0)
241 ; VI-NEXT: s_bitset1_b32 s2, 31
242 ; VI-NEXT: v_mov_b32_e32 v0, s0
243 ; VI-NEXT: v_mov_b32_e32 v1, s1
244 ; VI-NEXT: v_mov_b32_e32 v2, s2
245 ; VI-NEXT: flat_store_dword v[0:1], v2
248 ; GFX11-LABEL: s_test_copysign_f32_neg10:
250 ; GFX11-NEXT: s_clause 0x1
251 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
252 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
253 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
254 ; GFX11-NEXT: s_bitset1_b32 s2, 31
255 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
256 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
257 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
258 ; GFX11-NEXT: s_nop 0
259 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
260 ; GFX11-NEXT: s_endpgm
261 %result = call float @llvm.copysign.f32(float %mag, float -10.0)
262 store float %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f32 with the magnitude fixed to the constant 0.0 and
; the sign taken from a kernel argument. The checks expect only a scalar
; s_and_b32 of the sign with 0x80000000 (no following s_or_b32) before the store.
266 define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, float %sign) {
267 ; SI-LABEL: s_test_copysign_f32_0_mag:
269 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
270 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
271 ; SI-NEXT: s_mov_b32 s3, 0xf000
272 ; SI-NEXT: s_mov_b32 s2, -1
273 ; SI-NEXT: s_waitcnt lgkmcnt(0)
274 ; SI-NEXT: s_and_b32 s4, s4, 0x80000000
275 ; SI-NEXT: v_mov_b32_e32 v0, s4
276 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
279 ; VI-LABEL: s_test_copysign_f32_0_mag:
281 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
282 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
283 ; VI-NEXT: s_waitcnt lgkmcnt(0)
284 ; VI-NEXT: s_and_b32 s2, s2, 0x80000000
285 ; VI-NEXT: v_mov_b32_e32 v0, s0
286 ; VI-NEXT: v_mov_b32_e32 v1, s1
287 ; VI-NEXT: v_mov_b32_e32 v2, s2
288 ; VI-NEXT: flat_store_dword v[0:1], v2
291 ; GFX11-LABEL: s_test_copysign_f32_0_mag:
293 ; GFX11-NEXT: s_clause 0x1
294 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
295 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
296 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
297 ; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
298 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
299 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
300 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
301 ; GFX11-NEXT: s_nop 0
302 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
303 ; GFX11-NEXT: s_endpgm
304 %result = call float @llvm.copysign.f32(float 0.0, float %sign)
305 store float %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f32 with the magnitude fixed to the constant 1.0 and
; the sign taken from a kernel argument. The checks expect a scalar s_and_b32 of
; the sign with 0x80000000 followed by an s_or_b32 with the inline constant 1.0.
310 define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, float %sign) {
311 ; SI-LABEL: s_test_copysign_f32_1_mag:
313 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
314 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
315 ; SI-NEXT: s_mov_b32 s3, 0xf000
316 ; SI-NEXT: s_mov_b32 s2, -1
317 ; SI-NEXT: s_waitcnt lgkmcnt(0)
318 ; SI-NEXT: s_and_b32 s4, s4, 0x80000000
319 ; SI-NEXT: s_or_b32 s4, s4, 1.0
320 ; SI-NEXT: v_mov_b32_e32 v0, s4
321 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
324 ; VI-LABEL: s_test_copysign_f32_1_mag:
326 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
327 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
328 ; VI-NEXT: s_waitcnt lgkmcnt(0)
329 ; VI-NEXT: s_and_b32 s2, s2, 0x80000000
330 ; VI-NEXT: s_or_b32 s2, s2, 1.0
331 ; VI-NEXT: v_mov_b32_e32 v0, s0
332 ; VI-NEXT: v_mov_b32_e32 v1, s1
333 ; VI-NEXT: v_mov_b32_e32 v2, s2
334 ; VI-NEXT: flat_store_dword v[0:1], v2
337 ; GFX11-LABEL: s_test_copysign_f32_1_mag:
339 ; GFX11-NEXT: s_clause 0x1
340 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
341 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
342 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
343 ; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
344 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
345 ; GFX11-NEXT: s_or_b32 s2, s2, 1.0
346 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
347 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
348 ; GFX11-NEXT: s_nop 0
349 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
350 ; GFX11-NEXT: s_endpgm
351 %result = call float @llvm.copysign.f32(float 1.0, float %sign)
352 store float %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f32 with the magnitude fixed to the constant 10.0 and
; the sign taken from a kernel argument. The checks expect a scalar s_and_b32 of
; the sign with 0x80000000 followed by an s_or_b32 with the literal 0x41200000.
356 define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, float %sign) {
357 ; SI-LABEL: s_test_copysign_f32_10_mag:
359 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
360 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
361 ; SI-NEXT: s_mov_b32 s3, 0xf000
362 ; SI-NEXT: s_mov_b32 s2, -1
363 ; SI-NEXT: s_waitcnt lgkmcnt(0)
364 ; SI-NEXT: s_and_b32 s4, s4, 0x80000000
365 ; SI-NEXT: s_or_b32 s4, s4, 0x41200000
366 ; SI-NEXT: v_mov_b32_e32 v0, s4
367 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
370 ; VI-LABEL: s_test_copysign_f32_10_mag:
372 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
373 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
374 ; VI-NEXT: s_waitcnt lgkmcnt(0)
375 ; VI-NEXT: s_and_b32 s2, s2, 0x80000000
376 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000
377 ; VI-NEXT: v_mov_b32_e32 v0, s0
378 ; VI-NEXT: v_mov_b32_e32 v1, s1
379 ; VI-NEXT: v_mov_b32_e32 v2, s2
380 ; VI-NEXT: flat_store_dword v[0:1], v2
383 ; GFX11-LABEL: s_test_copysign_f32_10_mag:
385 ; GFX11-NEXT: s_clause 0x1
386 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
387 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
388 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
389 ; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
390 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
391 ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000
392 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
393 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
394 ; GFX11-NEXT: s_nop 0
395 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
396 ; GFX11-NEXT: s_endpgm
397 %result = call float @llvm.copysign.f32(float 10.0, float %sign)
398 store float %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f32 with the magnitude fixed to the constant -1.0 and
; the sign taken from a kernel argument. The expected code ORs the masked sign
; with the positive inline constant 1.0, i.e. the checks expect the same
; s_and_b32 / s_or_b32 pair as for a magnitude of +1.0.
402 define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, float %sign) {
403 ; SI-LABEL: s_test_copysign_f32_neg1_mag:
405 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
406 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
407 ; SI-NEXT: s_mov_b32 s3, 0xf000
408 ; SI-NEXT: s_mov_b32 s2, -1
409 ; SI-NEXT: s_waitcnt lgkmcnt(0)
410 ; SI-NEXT: s_and_b32 s4, s4, 0x80000000
411 ; SI-NEXT: s_or_b32 s4, s4, 1.0
412 ; SI-NEXT: v_mov_b32_e32 v0, s4
413 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
416 ; VI-LABEL: s_test_copysign_f32_neg1_mag:
418 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
419 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
420 ; VI-NEXT: s_waitcnt lgkmcnt(0)
421 ; VI-NEXT: s_and_b32 s2, s2, 0x80000000
422 ; VI-NEXT: s_or_b32 s2, s2, 1.0
423 ; VI-NEXT: v_mov_b32_e32 v0, s0
424 ; VI-NEXT: v_mov_b32_e32 v1, s1
425 ; VI-NEXT: v_mov_b32_e32 v2, s2
426 ; VI-NEXT: flat_store_dword v[0:1], v2
429 ; GFX11-LABEL: s_test_copysign_f32_neg1_mag:
431 ; GFX11-NEXT: s_clause 0x1
432 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
433 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
434 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
435 ; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
436 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
437 ; GFX11-NEXT: s_or_b32 s2, s2, 1.0
438 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
439 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
440 ; GFX11-NEXT: s_nop 0
441 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
442 ; GFX11-NEXT: s_endpgm
443 %result = call float @llvm.copysign.f32(float -1.0, float %sign)
444 store float %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.f32 with the magnitude fixed to the constant -10.0
; and the sign taken from a kernel argument. The expected code ORs the masked
; sign with 0x41200000, the same literal the checks for a +10.0 magnitude expect.
448 define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, float %sign) {
449 ; SI-LABEL: s_test_copysign_f32_neg10_mag:
451 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
452 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
453 ; SI-NEXT: s_mov_b32 s3, 0xf000
454 ; SI-NEXT: s_mov_b32 s2, -1
455 ; SI-NEXT: s_waitcnt lgkmcnt(0)
456 ; SI-NEXT: s_and_b32 s4, s4, 0x80000000
457 ; SI-NEXT: s_or_b32 s4, s4, 0x41200000
458 ; SI-NEXT: v_mov_b32_e32 v0, s4
459 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
462 ; VI-LABEL: s_test_copysign_f32_neg10_mag:
464 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
465 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
466 ; VI-NEXT: s_waitcnt lgkmcnt(0)
467 ; VI-NEXT: s_and_b32 s2, s2, 0x80000000
468 ; VI-NEXT: s_or_b32 s2, s2, 0x41200000
469 ; VI-NEXT: v_mov_b32_e32 v0, s0
470 ; VI-NEXT: v_mov_b32_e32 v1, s1
471 ; VI-NEXT: v_mov_b32_e32 v2, s2
472 ; VI-NEXT: flat_store_dword v[0:1], v2
475 ; GFX11-LABEL: s_test_copysign_f32_neg10_mag:
477 ; GFX11-NEXT: s_clause 0x1
478 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
479 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
480 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
481 ; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
482 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
483 ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000
484 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
485 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
486 ; GFX11-NEXT: s_nop 0
487 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
488 ; GFX11-NEXT: s_endpgm
489 %result = call float @llvm.copysign.f32(float -10.0, float %sign)
490 store float %result, ptr addrspace(1) %out, align 4
; Kernel test: llvm.copysign.v2f32 with both vector operands taken from kernel
; arguments. The checks expect the operation to be done per element, with two
; v_bfi_b32 instructions, and the two results written with one 64-bit store.
494 define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) {
495 ; SI-LABEL: s_test_copysign_v2f32:
497 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
498 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
499 ; SI-NEXT: s_brev_b32 s8, -2
500 ; SI-NEXT: s_mov_b32 s3, 0xf000
501 ; SI-NEXT: s_mov_b32 s2, -1
502 ; SI-NEXT: s_waitcnt lgkmcnt(0)
503 ; SI-NEXT: v_mov_b32_e32 v0, s5
504 ; SI-NEXT: v_mov_b32_e32 v1, s7
505 ; SI-NEXT: v_bfi_b32 v1, s8, v0, v1
506 ; SI-NEXT: v_mov_b32_e32 v0, s4
507 ; SI-NEXT: v_mov_b32_e32 v2, s6
508 ; SI-NEXT: v_bfi_b32 v0, s8, v0, v2
509 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
512 ; VI-LABEL: s_test_copysign_v2f32:
514 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
515 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
516 ; VI-NEXT: s_brev_b32 s2, -2
517 ; VI-NEXT: s_waitcnt lgkmcnt(0)
518 ; VI-NEXT: v_mov_b32_e32 v0, s5
519 ; VI-NEXT: v_mov_b32_e32 v1, s7
520 ; VI-NEXT: v_mov_b32_e32 v2, s4
521 ; VI-NEXT: v_bfi_b32 v1, s2, v0, v1
522 ; VI-NEXT: v_mov_b32_e32 v0, s6
523 ; VI-NEXT: v_bfi_b32 v0, s2, v2, v0
524 ; VI-NEXT: v_mov_b32_e32 v3, s1
525 ; VI-NEXT: v_mov_b32_e32 v2, s0
526 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
529 ; GFX11-LABEL: s_test_copysign_v2f32:
531 ; GFX11-NEXT: s_clause 0x1
532 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
533 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
534 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
535 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
536 ; GFX11-NEXT: v_mov_b32_e32 v2, s6
537 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
538 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0
539 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v2
540 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
541 ; GFX11-NEXT: s_nop 0
542 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
543 ; GFX11-NEXT: s_endpgm
544 %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
545 store <2 x float> %result, ptr addrspace(1) %out, align 8
; Kernel test: llvm.copysign.v3f32 with both vector operands taken from kernel
; arguments. The checks expect three v_bfi_b32 instructions (one per element).
; The store differs by target in the expected output: the SI checks use a dword
; store at offset 8 plus a dwordx2 store, the VI checks use flat_store_dwordx3,
; and the GFX11 checks use global_store_b96.
549 define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x float> %mag, <3 x float> %sign) {
550 ; SI-LABEL: s_test_copysign_v3f32:
552 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd
553 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
554 ; SI-NEXT: s_waitcnt lgkmcnt(0)
555 ; SI-NEXT: s_brev_b32 s7, -2
556 ; SI-NEXT: s_mov_b32 s3, 0xf000
557 ; SI-NEXT: s_mov_b32 s2, -1
558 ; SI-NEXT: v_mov_b32_e32 v0, s5
559 ; SI-NEXT: v_mov_b32_e32 v1, s9
560 ; SI-NEXT: v_bfi_b32 v1, s7, v0, v1
561 ; SI-NEXT: v_mov_b32_e32 v0, s4
562 ; SI-NEXT: v_mov_b32_e32 v2, s8
563 ; SI-NEXT: v_bfi_b32 v0, s7, v0, v2
564 ; SI-NEXT: v_mov_b32_e32 v2, s6
565 ; SI-NEXT: v_mov_b32_e32 v3, s10
566 ; SI-NEXT: v_bfi_b32 v2, s7, v2, v3
567 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
568 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
571 ; VI-LABEL: s_test_copysign_v3f32:
573 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
574 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
575 ; VI-NEXT: s_brev_b32 s2, -2
576 ; VI-NEXT: s_waitcnt lgkmcnt(0)
577 ; VI-NEXT: v_mov_b32_e32 v0, s6
578 ; VI-NEXT: v_mov_b32_e32 v1, s10
579 ; VI-NEXT: v_mov_b32_e32 v3, s5
580 ; VI-NEXT: v_bfi_b32 v2, s2, v0, v1
581 ; VI-NEXT: v_mov_b32_e32 v0, s9
582 ; VI-NEXT: v_bfi_b32 v1, s2, v3, v0
583 ; VI-NEXT: v_mov_b32_e32 v0, s4
584 ; VI-NEXT: v_mov_b32_e32 v3, s8
585 ; VI-NEXT: v_bfi_b32 v0, s2, v0, v3
586 ; VI-NEXT: v_mov_b32_e32 v4, s1
587 ; VI-NEXT: v_mov_b32_e32 v3, s0
588 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
591 ; GFX11-LABEL: s_test_copysign_v3f32:
593 ; GFX11-NEXT: s_clause 0x1
594 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
595 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
596 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
597 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
598 ; GFX11-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s9
599 ; GFX11-NEXT: v_mov_b32_e32 v3, s8
600 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
601 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s6, v0
602 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v1
603 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
604 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v3
605 ; GFX11-NEXT: global_store_b96 v4, v[0:2], s[0:1]
606 ; GFX11-NEXT: s_nop 0
607 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
608 ; GFX11-NEXT: s_endpgm
609 %result = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign)
610 store <3 x float> %result, ptr addrspace(1) %out, align 16
; Kernel test: llvm.copysign.v4f32 with both vector operands taken from kernel
; arguments. The checks expect four v_bfi_b32 instructions (one per element)
; and a single four-dword store of the result to %out on every target.
614 define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x float> %mag, <4 x float> %sign) {
615 ; SI-LABEL: s_test_copysign_v4f32:
617 ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd
618 ; SI-NEXT: s_brev_b32 s12, -2
619 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
620 ; SI-NEXT: s_mov_b32 s3, 0xf000
621 ; SI-NEXT: s_mov_b32 s2, -1
622 ; SI-NEXT: s_waitcnt lgkmcnt(0)
623 ; SI-NEXT: v_mov_b32_e32 v0, s7
624 ; SI-NEXT: v_mov_b32_e32 v1, s11
625 ; SI-NEXT: v_bfi_b32 v3, s12, v0, v1
626 ; SI-NEXT: v_mov_b32_e32 v0, s6
627 ; SI-NEXT: v_mov_b32_e32 v1, s10
628 ; SI-NEXT: v_bfi_b32 v2, s12, v0, v1
629 ; SI-NEXT: v_mov_b32_e32 v0, s5
630 ; SI-NEXT: v_mov_b32_e32 v1, s9
631 ; SI-NEXT: v_bfi_b32 v1, s12, v0, v1
632 ; SI-NEXT: v_mov_b32_e32 v0, s4
633 ; SI-NEXT: v_mov_b32_e32 v4, s8
634 ; SI-NEXT: v_bfi_b32 v0, s12, v0, v4
635 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
638 ; VI-LABEL: s_test_copysign_v4f32:
640 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
641 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
642 ; VI-NEXT: s_brev_b32 s2, -2
643 ; VI-NEXT: s_waitcnt lgkmcnt(0)
644 ; VI-NEXT: v_mov_b32_e32 v0, s7
645 ; VI-NEXT: v_mov_b32_e32 v1, s11
646 ; VI-NEXT: v_mov_b32_e32 v2, s6
647 ; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
648 ; VI-NEXT: v_mov_b32_e32 v0, s10
649 ; VI-NEXT: v_bfi_b32 v2, s2, v2, v0
650 ; VI-NEXT: v_mov_b32_e32 v0, s5
651 ; VI-NEXT: v_mov_b32_e32 v1, s9
652 ; VI-NEXT: v_bfi_b32 v1, s2, v0, v1
653 ; VI-NEXT: v_mov_b32_e32 v0, s4
654 ; VI-NEXT: v_mov_b32_e32 v4, s8
655 ; VI-NEXT: v_bfi_b32 v0, s2, v0, v4
656 ; VI-NEXT: v_mov_b32_e32 v5, s1
657 ; VI-NEXT: v_mov_b32_e32 v4, s0
658 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
661 ; GFX11-LABEL: s_test_copysign_v4f32:
663 ; GFX11-NEXT: s_clause 0x1
664 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
665 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
666 ; GFX11-NEXT: v_mov_b32_e32 v6, 0
667 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
668 ; GFX11-NEXT: v_dual_mov_b32 v0, s11 :: v_dual_mov_b32 v1, s10
669 ; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s8
670 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
671 ; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v0
672 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s6, v1
673 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
674 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v4
675 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v5
676 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
677 ; GFX11-NEXT: s_nop 0
678 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
679 ; GFX11-NEXT: s_endpgm
680 %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
681 store <4 x float> %result, ptr addrspace(1) %out, align 16
; Non-kernel function test: llvm.copysign.f32 on two float function arguments.
; SI and VI share one check prefix here and expect s_brev_b32 -2 followed by a
; v_bfi_b32 on v0/v1; the GFX11 checks expect a single v_bfi_b32 with the literal
; 0x7fffffff. Both expected sequences end with s_setpc_b64 s[30:31].
685 define float @v_test_copysign_f32(float %mag, float %sign) {
686 ; SIVI-LABEL: v_test_copysign_f32:
688 ; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
689 ; SIVI-NEXT: s_brev_b32 s4, -2
690 ; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v1
691 ; SIVI-NEXT: s_setpc_b64 s[30:31]
693 ; GFX11-LABEL: v_test_copysign_f32:
695 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
696 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
697 ; GFX11-NEXT: s_setpc_b64 s[30:31]
698 %result = call float @llvm.copysign.f32(float %mag, float %sign)
; Non-kernel function test: llvm.copysign.f32 with the sign operand fixed to 0.0.
; All targets are expected to emit a single v_and_b32 of v0 with 0x7fffffff.
702 define float @v_test_copysign_f32_0(float %mag) {
703 ; SIVI-LABEL: v_test_copysign_f32_0:
705 ; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
706 ; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
707 ; SIVI-NEXT: s_setpc_b64 s[30:31]
709 ; GFX11-LABEL: v_test_copysign_f32_0:
711 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
712 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
713 ; GFX11-NEXT: s_setpc_b64 s[30:31]
714 %result = call float @llvm.copysign.f32(float %mag, float 0.0)
; Non-kernel function test: llvm.copysign.f32 with the sign operand fixed to 1.0.
; The expected code is a single v_and_b32 of v0 with 0x7fffffff, matching what
; the checks expect for the 0.0 sign constant.
718 define float @v_test_copysign_f32_1(float %mag) {
719 ; SIVI-LABEL: v_test_copysign_f32_1:
721 ; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
722 ; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
723 ; SIVI-NEXT: s_setpc_b64 s[30:31]
725 ; GFX11-LABEL: v_test_copysign_f32_1:
727 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
728 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
729 ; GFX11-NEXT: s_setpc_b64 s[30:31]
730 %result = call float @llvm.copysign.f32(float %mag, float 1.0)
; Non-kernel function test: llvm.copysign.f32 with the sign operand fixed to 10.0.
; The expected code is again a single v_and_b32 of v0 with 0x7fffffff.
734 define float @v_test_copysign_f32_10(float %mag) {
735 ; SIVI-LABEL: v_test_copysign_f32_10:
737 ; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
738 ; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
739 ; SIVI-NEXT: s_setpc_b64 s[30:31]
741 ; GFX11-LABEL: v_test_copysign_f32_10:
743 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
744 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
745 ; GFX11-NEXT: s_setpc_b64 s[30:31]
746 %result = call float @llvm.copysign.f32(float %mag, float 10.0)
; Non-kernel function test: llvm.copysign.f32 with the sign operand fixed to -1.0.
; All targets are expected to emit a single v_or_b32 of v0 with 0x80000000.
750 define float @v_test_copysign_f32_neg1(float %mag) {
751 ; SIVI-LABEL: v_test_copysign_f32_neg1:
753 ; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
754 ; SIVI-NEXT: v_or_b32_e32 v0, 0x80000000, v0
755 ; SIVI-NEXT: s_setpc_b64 s[30:31]
757 ; GFX11-LABEL: v_test_copysign_f32_neg1:
759 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
760 ; GFX11-NEXT: v_or_b32_e32 v0, 0x80000000, v0
761 ; GFX11-NEXT: s_setpc_b64 s[30:31]
762 %result = call float @llvm.copysign.f32(float %mag, float -1.0)
; Non-kernel function test: llvm.copysign.f32 with the sign operand fixed to
; -10.0. The expected code is a single v_or_b32 of v0 with 0x80000000, matching
; what the checks expect for the -1.0 sign constant.
766 define float @v_test_copysign_f32_neg10(float %mag) {
767 ; SIVI-LABEL: v_test_copysign_f32_neg10:
769 ; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
770 ; SIVI-NEXT: v_or_b32_e32 v0, 0x80000000, v0
771 ; SIVI-NEXT: s_setpc_b64 s[30:31]
773 ; GFX11-LABEL: v_test_copysign_f32_neg10:
775 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
776 ; GFX11-NEXT: v_or_b32_e32 v0, 0x80000000, v0
777 ; GFX11-NEXT: s_setpc_b64 s[30:31]
778 %result = call float @llvm.copysign.f32(float %mag, float -10.0)
; Non-kernel function test: llvm.copysign.v2f32 on two vector function arguments.
; The checks expect one v_bfi_b32 per element, pairing v0 with v2 and v1 with v3.
782 define <2 x float> @v_test_copysign_v2f32(<2 x float> %mag, <2 x float> %sign) {
783 ; SIVI-LABEL: v_test_copysign_v2f32:
785 ; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
786 ; SIVI-NEXT: s_brev_b32 s4, -2
787 ; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v2
788 ; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v3
789 ; SIVI-NEXT: s_setpc_b64 s[30:31]
791 ; GFX11-LABEL: v_test_copysign_v2f32:
793 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
794 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2
795 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3
796 ; GFX11-NEXT: s_setpc_b64 s[30:31]
797 %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
798 ret <2 x float> %result
; Non-kernel function test: llvm.copysign.v2f32 with an all-zero sign vector
; (zeroinitializer). The checks expect one v_and_b32 with 0x7fffffff per element.
801 define <2 x float> @v_test_copysign_v2f32_0(<2 x float> %mag) {
802 ; SIVI-LABEL: v_test_copysign_v2f32_0:
804 ; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
805 ; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
806 ; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
807 ; SIVI-NEXT: s_setpc_b64 s[30:31]
809 ; GFX11-LABEL: v_test_copysign_v2f32_0:
811 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
812 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
813 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
814 ; GFX11-NEXT: s_setpc_b64 s[30:31]
815 %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> zeroinitializer)
816 ret <2 x float> %result
; Non-kernel function test: llvm.copysign.v2f32 with a constant sign vector whose
; elements are both -1.0. The checks expect one v_or_b32 with 0x80000000 per
; element.
819 define <2 x float> @v_test_copysign_v2f32_neg1(<2 x float> %mag) {
820 ; SIVI-LABEL: v_test_copysign_v2f32_neg1:
822 ; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
823 ; SIVI-NEXT: v_or_b32_e32 v0, 0x80000000, v0
824 ; SIVI-NEXT: v_or_b32_e32 v1, 0x80000000, v1
825 ; SIVI-NEXT: s_setpc_b64 s[30:31]
827 ; GFX11-LABEL: v_test_copysign_v2f32_neg1:
829 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
830 ; GFX11-NEXT: v_or_b32_e32 v0, 0x80000000, v0
831 ; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v1
832 ; GFX11-NEXT: s_setpc_b64 s[30:31]
833 %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> <float -1.0, float -1.0>)
834 ret <2 x float> %result
; Non-kernel function test: llvm.copysign.v3f32 on two vector function arguments.
; The checks expect three v_bfi_b32 instructions, pairing v0..v2 with v3..v5.
837 define <3 x float> @v_test_copysign_v3f32(<3 x float> %mag, <3 x float> %sign) {
838 ; SIVI-LABEL: v_test_copysign_v3f32:
840 ; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
841 ; SIVI-NEXT: s_brev_b32 s4, -2
842 ; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v3
843 ; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v4
844 ; SIVI-NEXT: v_bfi_b32 v2, s4, v2, v5
845 ; SIVI-NEXT: s_setpc_b64 s[30:31]
847 ; GFX11-LABEL: v_test_copysign_v3f32:
849 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
850 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3
851 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4
852 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v5
853 ; GFX11-NEXT: s_setpc_b64 s[30:31]
854 %result = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign)
855 ret <3 x float> %result
; Non-kernel function test: llvm.copysign.v4f32 on two vector function arguments.
; The checks expect four v_bfi_b32 instructions, pairing v0..v3 with v4..v7.
858 define <4 x float> @v_test_copysign_v4f32(<4 x float> %mag, <4 x float> %sign) {
859 ; SIVI-LABEL: v_test_copysign_v4f32:
861 ; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
862 ; SIVI-NEXT: s_brev_b32 s4, -2
863 ; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v4
864 ; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v5
865 ; SIVI-NEXT: v_bfi_b32 v2, s4, v2, v6
866 ; SIVI-NEXT: v_bfi_b32 v3, s4, v3, v7
867 ; SIVI-NEXT: s_setpc_b64 s[30:31]
869 ; GFX11-LABEL: v_test_copysign_v4f32:
871 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
872 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v4
873 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5
874 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v6
875 ; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v7
876 ; GFX11-NEXT: s_setpc_b64 s[30:31]
877 %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
878 ret <4 x float> %result
; Non-kernel function test: llvm.copysign.v5f32 on two vector function arguments.
; The checks expect five v_bfi_b32 instructions, pairing v0..v4 with v5..v9.
881 define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) {
882 ; SIVI-LABEL: v_test_copysign_v5f32:
884 ; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
885 ; SIVI-NEXT: s_brev_b32 s4, -2
886 ; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v5
887 ; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v6
888 ; SIVI-NEXT: v_bfi_b32 v2, s4, v2, v7
889 ; SIVI-NEXT: v_bfi_b32 v3, s4, v3, v8
890 ; SIVI-NEXT: v_bfi_b32 v4, s4, v4, v9
891 ; SIVI-NEXT: s_setpc_b64 s[30:31]
893 ; GFX11-LABEL: v_test_copysign_v5f32:
895 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
896 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v5
897 ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6
898 ; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v7
899 ; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8
900 ; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v4, v9
901 ; GFX11-NEXT: s_setpc_b64 s[30:31]
902 %result = call <5 x float> @llvm.copysign.v5f32(<5 x float> %mag, <5 x float> %sign)
903 ret <5 x float> %result
; Kernel: f32 copysign whose sign operand is a double truncated to float.
; The expected code for all three targets contains no conversion instruction:
; the sign input of v_bfi_b32 is copied from the upper register of the loaded
; 64-bit pair (s1 on tahiti, s3 on tonga and gfx1100), i.e. the fptrunc is
; expected to be folded away because only the sign bit is consumed.
906 define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out, float %mag, double %sign) {
907 ; SI-LABEL: s_test_copysign_f32_fptrunc_f64:
909 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
910 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
911 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
912 ; SI-NEXT: s_waitcnt lgkmcnt(0)
913 ; SI-NEXT: s_brev_b32 s0, -2
914 ; SI-NEXT: s_mov_b32 s7, 0xf000
915 ; SI-NEXT: s_mov_b32 s6, -1
916 ; SI-NEXT: v_mov_b32_e32 v0, s2
917 ; SI-NEXT: v_mov_b32_e32 v1, s1
918 ; SI-NEXT: v_bfi_b32 v0, s0, v0, v1
919 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
922 ; VI-LABEL: s_test_copysign_f32_fptrunc_f64:
924 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
925 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
926 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
927 ; VI-NEXT: s_waitcnt lgkmcnt(0)
928 ; VI-NEXT: s_brev_b32 s2, -2
929 ; VI-NEXT: v_mov_b32_e32 v0, s4
930 ; VI-NEXT: v_mov_b32_e32 v1, s3
931 ; VI-NEXT: v_bfi_b32 v2, s2, v0, v1
932 ; VI-NEXT: v_mov_b32_e32 v0, s0
933 ; VI-NEXT: v_mov_b32_e32 v1, s1
934 ; VI-NEXT: flat_store_dword v[0:1], v2
937 ; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64:
939 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
940 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
941 ; GFX11-NEXT: s_clause 0x1
942 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
943 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
944 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
945 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
946 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
947 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
948 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
949 ; GFX11-NEXT: s_nop 0
950 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
951 ; GFX11-NEXT: s_endpgm
952 %sign.trunc = fptrunc double %sign to float
953 %result = call float @llvm.copysign.f32(float %mag, float %sign.trunc)
954 store float %result, ptr addrspace(1) %out, align 4
; Kernel: copysign of the constant 1.0 with a sign taken from a truncated
; double. With a constant magnitude the expected code uses no v_bfi_b32 and no
; conversion: every target isolates the sign bit of s3 with
; s_and_b32 ..., 0x80000000 and merges it with 1.0 via s_or_b32, entirely on
; the scalar unit, before moving the result to a VGPR for the store.
958 define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %out, double %sign) {
959 ; SI-LABEL: s_test_copysign_f32_1_fptrunc_f64:
961 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
962 ; SI-NEXT: s_mov_b32 s7, 0xf000
963 ; SI-NEXT: s_mov_b32 s6, -1
964 ; SI-NEXT: s_waitcnt lgkmcnt(0)
965 ; SI-NEXT: s_mov_b32 s4, s0
966 ; SI-NEXT: s_and_b32 s0, s3, 0x80000000
967 ; SI-NEXT: s_or_b32 s0, s0, 1.0
968 ; SI-NEXT: s_mov_b32 s5, s1
969 ; SI-NEXT: v_mov_b32_e32 v0, s0
970 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
973 ; VI-LABEL: s_test_copysign_f32_1_fptrunc_f64:
975 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
976 ; VI-NEXT: s_waitcnt lgkmcnt(0)
977 ; VI-NEXT: v_mov_b32_e32 v0, s0
978 ; VI-NEXT: s_and_b32 s0, s3, 0x80000000
979 ; VI-NEXT: s_or_b32 s0, s0, 1.0
980 ; VI-NEXT: v_mov_b32_e32 v1, s1
981 ; VI-NEXT: v_mov_b32_e32 v2, s0
982 ; VI-NEXT: flat_store_dword v[0:1], v2
985 ; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64:
987 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
988 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
989 ; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
990 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
991 ; GFX11-NEXT: s_or_b32 s2, s2, 1.0
992 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
993 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
994 ; GFX11-NEXT: s_nop 0
995 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
996 ; GFX11-NEXT: s_endpgm
997 %sign.trunc = fptrunc double %sign to float
998 %result = call float @llvm.copysign.f32(float 1.0, float %sign.trunc)
999 store float %result, ptr addrspace(1) %out, align 4
; Kernel: f32 copysign whose sign operand is a half extended to float.
; The targets differ in how the sign is prepared for v_bfi_b32: the tahiti
; checks expect a real v_cvt_f32_f16_e32 conversion, while the tonga and
; gfx1100 checks expect only a 16-bit left shift (v_lshlrev_b32_e64) that
; moves the half's bits into the upper 16 bits of the register.
1003 define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, float %mag, half %sign) {
1004 ; SI-LABEL: s_test_copysign_f32_fpext_f16:
1006 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1007 ; SI-NEXT: s_mov_b32 s7, 0xf000
1008 ; SI-NEXT: s_mov_b32 s6, -1
1009 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1010 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
1011 ; SI-NEXT: s_mov_b32 s4, s0
1012 ; SI-NEXT: s_brev_b32 s0, -2
1013 ; SI-NEXT: v_mov_b32_e32 v1, s2
1014 ; SI-NEXT: s_mov_b32 s5, s1
1015 ; SI-NEXT: v_bfi_b32 v0, s0, v1, v0
1016 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1019 ; VI-LABEL: s_test_copysign_f32_fpext_f16:
1021 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1022 ; VI-NEXT: s_brev_b32 s4, -2
1023 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1024 ; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3
1025 ; VI-NEXT: v_mov_b32_e32 v1, s2
1026 ; VI-NEXT: v_bfi_b32 v2, s4, v1, v0
1027 ; VI-NEXT: v_mov_b32_e32 v0, s0
1028 ; VI-NEXT: v_mov_b32_e32 v1, s1
1029 ; VI-NEXT: flat_store_dword v[0:1], v2
1032 ; GFX11-LABEL: s_test_copysign_f32_fpext_f16:
1034 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1035 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1036 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1037 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3
1038 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1039 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
1040 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1041 ; GFX11-NEXT: s_nop 0
1042 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1043 ; GFX11-NEXT: s_endpgm
1044 %sign.ext = fpext half %sign to float
1045 %result = call float @llvm.copysign.f32(float %mag, float %sign.ext)
1046 store float %result, ptr addrspace(1) %out, align 4
; Kernel: copysign of the constant 1.0 with a sign taken from an extended half.
; No v_bfi_b32 is expected. The tahiti checks expect v_cvt_f32_f16_e32 followed
; by vector v_and_b32 (0x80000000) and v_or_b32 (1.0). The tonga and gfx1100
; checks expect the whole computation on the scalar unit: s_lshl_b32 by 16,
; s_and_b32 with 0x80000000, then s_or_b32 with 1.0.
1050 define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out, half %sign) {
1051 ; SI-LABEL: s_test_copysign_f32_1_fpext_f16:
1053 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
1054 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1055 ; SI-NEXT: s_mov_b32 s3, 0xf000
1056 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1057 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
1058 ; SI-NEXT: s_mov_b32 s2, -1
1059 ; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0
1060 ; SI-NEXT: v_or_b32_e32 v0, 1.0, v0
1061 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1064 ; VI-LABEL: s_test_copysign_f32_1_fpext_f16:
1066 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
1067 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1068 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1069 ; VI-NEXT: s_lshl_b32 s2, s2, 16
1070 ; VI-NEXT: s_and_b32 s2, s2, 0x80000000
1071 ; VI-NEXT: s_or_b32 s2, s2, 1.0
1072 ; VI-NEXT: v_mov_b32_e32 v0, s0
1073 ; VI-NEXT: v_mov_b32_e32 v1, s1
1074 ; VI-NEXT: v_mov_b32_e32 v2, s2
1075 ; VI-NEXT: flat_store_dword v[0:1], v2
1078 ; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16:
1080 ; GFX11-NEXT: s_clause 0x1
1081 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
1082 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1083 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1084 ; GFX11-NEXT: s_lshl_b32 s2, s2, 16
1085 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1086 ; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
1087 ; GFX11-NEXT: s_or_b32 s2, s2, 1.0
1088 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1089 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
1090 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1091 ; GFX11-NEXT: s_nop 0
1092 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1093 ; GFX11-NEXT: s_endpgm
1094 %sign.ext = fpext half %sign to float
1095 %result = call float @llvm.copysign.f32(float 1.0, float %sign.ext)
1096 store float %result, ptr addrspace(1) %out, align 4
; Kernel: f32 copysign whose sign operand is a bfloat extended to float.
; On all three targets the expected code performs the extension as a scalar
; s_lshl_b32 by 16, copies the shifted value into a VGPR and feeds it to
; v_bfi_b32 as the sign source; no conversion instruction is expected.
1100 define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, float %mag, bfloat %sign) {
1101 ; SI-LABEL: s_test_copysign_f32_fpext_bf16:
1103 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1104 ; SI-NEXT: s_mov_b32 s7, 0xf000
1105 ; SI-NEXT: s_mov_b32 s6, -1
1106 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1107 ; SI-NEXT: s_mov_b32 s4, s0
1108 ; SI-NEXT: s_lshl_b32 s0, s3, 16
1109 ; SI-NEXT: s_mov_b32 s5, s1
1110 ; SI-NEXT: s_brev_b32 s1, -2
1111 ; SI-NEXT: v_mov_b32_e32 v0, s2
1112 ; SI-NEXT: v_mov_b32_e32 v1, s0
1113 ; SI-NEXT: v_bfi_b32 v0, s1, v0, v1
1114 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1117 ; VI-LABEL: s_test_copysign_f32_fpext_bf16:
1119 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1120 ; VI-NEXT: s_brev_b32 s4, -2
1121 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1122 ; VI-NEXT: s_lshl_b32 s3, s3, 16
1123 ; VI-NEXT: v_mov_b32_e32 v0, s2
1124 ; VI-NEXT: v_mov_b32_e32 v1, s3
1125 ; VI-NEXT: v_bfi_b32 v2, s4, v0, v1
1126 ; VI-NEXT: v_mov_b32_e32 v0, s0
1127 ; VI-NEXT: v_mov_b32_e32 v1, s1
1128 ; VI-NEXT: flat_store_dword v[0:1], v2
1131 ; GFX11-LABEL: s_test_copysign_f32_fpext_bf16:
1133 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
1134 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1135 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1136 ; GFX11-NEXT: s_lshl_b32 s3, s3, 16
1137 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1138 ; GFX11-NEXT: v_mov_b32_e32 v0, s3
1139 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
1140 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1141 ; GFX11-NEXT: s_nop 0
1142 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1143 ; GFX11-NEXT: s_endpgm
1144 %sign.ext = fpext bfloat %sign to float
1145 %result = call float @llvm.copysign.f32(float %mag, float %sign.ext)
1146 store float %result, ptr addrspace(1) %out, align 4
; Declarations of the llvm.copysign intrinsic for the scalar f32 form and the
; 2-, 3-, 4- and 5-element float vector forms. Every declaration refers to
; attribute group #0, defined on the last line.
1150 declare float @llvm.copysign.f32(float, float) #0
1151 declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) #0
1152 declare <3 x float> @llvm.copysign.v3f32(<3 x float>, <3 x float>) #0
1153 declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) #0
1154 declare <5 x float> @llvm.copysign.v5f32(<5 x float>, <5 x float>) #0
1156 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }