1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
7 declare i32 @llvm.amdgcn.workitem.id.x() #1
8 declare half @llvm.fabs.f16(half)
9 declare float @llvm.fabs.f32(float)
10 declare double @llvm.fabs.f64(double)
12 ; All nan values are converted to 0xffffffff
; Kernel under test: each workitem loads a float from %fptr[workitem.id.x] and
; stores to %out either that value (%c == 0) or the constant
; 0xFFFFFFFFE0000000 (%c != 0).
; The autogenerated checks expect, for all four llc invocations, a scalar
; compare of %c against 0 feeding one v_cndmask_b32_e32 whose constant operand
; is the inline value -1 and whose other operand is the loaded VGPR.
13 define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
14 ; SI-LABEL: v_cnd_nan_nosgpr:
16 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
17 ; SI-NEXT: s_load_dword s8, s[0:1], 0xb
18 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
19 ; SI-NEXT: s_mov_b32 s7, 0xf000
20 ; SI-NEXT: s_mov_b32 s2, 0
21 ; SI-NEXT: s_mov_b32 s3, s7
22 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
23 ; SI-NEXT: v_mov_b32_e32 v1, 0
24 ; SI-NEXT: s_waitcnt lgkmcnt(0)
25 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
26 ; SI-NEXT: s_mov_b32 s6, -1
27 ; SI-NEXT: s_cmp_eq_u32 s8, 0
28 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
29 ; SI-NEXT: s_waitcnt vmcnt(0)
30 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
31 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
34 ; VI-LABEL: v_cnd_nan_nosgpr:
36 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
37 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
38 ; VI-NEXT: s_waitcnt lgkmcnt(0)
39 ; VI-NEXT: v_mov_b32_e32 v1, s3
40 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
41 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
42 ; VI-NEXT: flat_load_dword v0, v[0:1]
43 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
44 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
45 ; VI-NEXT: s_waitcnt lgkmcnt(0)
46 ; VI-NEXT: s_cmp_eq_u32 s2, 0
47 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
48 ; VI-NEXT: s_waitcnt vmcnt(0)
49 ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
50 ; VI-NEXT: v_mov_b32_e32 v0, s0
51 ; VI-NEXT: v_mov_b32_e32 v1, s1
52 ; VI-NEXT: flat_store_dword v[0:1], v2
55 ; GFX10-LABEL: v_cnd_nan_nosgpr:
57 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
58 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
59 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
60 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
61 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
62 ; GFX10-NEXT: s_clause 0x1
63 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
64 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
65 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
66 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
67 ; GFX10-NEXT: s_cmp_eq_u32 s4, 0
68 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0
69 ; GFX10-NEXT: s_waitcnt vmcnt(0)
70 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
71 ; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
72 ; GFX10-NEXT: s_endpgm
74 ; GFX11-LABEL: v_cnd_nan_nosgpr:
76 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
77 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
78 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
79 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
80 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
81 ; GFX11-NEXT: s_clause 0x1
82 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
83 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
84 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
85 ; GFX11-NEXT: s_cmp_eq_u32 s2, 0
86 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
87 ; GFX11-NEXT: s_waitcnt vmcnt(0)
88 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
89 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
91 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
92 ; GFX11-NEXT: s_endpgm
; IR under test. The select yields the constant when %c is non-zero; the
; expected code tests the opposite condition (s_cmp_eq_u32 ..., 0) so that the
; loaded value is the operand chosen when the compare result is set.
93 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
94 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
95 %f = load float, ptr addrspace(1) %f.gep
96 %setcc = icmp ne i32 %c, 0
97 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
98 store float %select, ptr addrspace(1) %out
102 ; This requires slightly trickier SGPR operand legalization since the
103 ; single constant bus SGPR usage is the last operand, and it should
105 ; However on GFX10 constant bus is limited to 2 scalar operands, not one.
106 ; All nan values are converted to 0xffffffff
; Kernel under test: stores to %out either the kernel argument %f (%c == 0) or
; the constant 0xFFFFFFFFE0000000 (%c != 0). Unlike the loaded-value variant,
; %f is read from the kernel argument segment with a scalar load, so it lives
; in an SGPR (s3 in every check block below).
; Expected lowering differs by target: the SI and VI checks first copy s3 into
; a VGPR and use v_cndmask_b32_e32 with vcc, whereas the gfx1010 and gfx1100
; checks use v_cndmask_b32_e64 taking s3 and an SGPR-pair condition directly.
107 define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 {
108 ; SI-LABEL: v_cnd_nan:
110 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
111 ; SI-NEXT: s_mov_b32 s7, 0xf000
112 ; SI-NEXT: s_mov_b32 s6, -1
113 ; SI-NEXT: s_waitcnt lgkmcnt(0)
114 ; SI-NEXT: s_mov_b32 s4, s0
115 ; SI-NEXT: s_mov_b32 s5, s1
116 ; SI-NEXT: s_cmp_eq_u32 s2, 0
117 ; SI-NEXT: v_mov_b32_e32 v0, s3
118 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
119 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
120 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
123 ; VI-LABEL: v_cnd_nan:
125 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
126 ; VI-NEXT: s_waitcnt lgkmcnt(0)
127 ; VI-NEXT: s_cmp_eq_u32 s2, 0
128 ; VI-NEXT: v_mov_b32_e32 v0, s3
129 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
130 ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
131 ; VI-NEXT: v_mov_b32_e32 v0, s0
132 ; VI-NEXT: v_mov_b32_e32 v1, s1
133 ; VI-NEXT: flat_store_dword v[0:1], v2
136 ; GFX10-LABEL: v_cnd_nan:
138 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
139 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
140 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
141 ; GFX10-NEXT: s_cmp_eq_u32 s2, 0
142 ; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0
143 ; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5]
144 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
145 ; GFX10-NEXT: s_endpgm
147 ; GFX11-LABEL: v_cnd_nan:
149 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
150 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
151 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
152 ; GFX11-NEXT: s_cmp_eq_u32 s2, 0
153 ; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0
154 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
155 ; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5]
156 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
157 ; GFX11-NEXT: s_nop 0
158 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
159 ; GFX11-NEXT: s_endpgm
; IR under test: integer compare on %c selecting between the constant and %f.
160 %setcc = icmp ne i32 %c, 0
161 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
162 store float %select, ptr addrspace(1) %out
166 ; Test different compare and select operand types for optimal code
168 ; (select (cmp (sgprX, constant)), constant, sgprZ)
; Kernel under test: compares the kernel argument %x against 0.0 with fcmp one
; and stores to %out[workitem.id.x] the constant 1.0 when the compare is true,
; otherwise the kernel argument %z. Both %x and %z are fetched by scalar loads.
; The checks expect the inverted predicate (v_cmp_nlg_f32_e64 ..., 0) and a
; v_cndmask with 1.0 as an inline constant. SI and VI copy %z into v2 first;
; gfx1010 and gfx1100 pass the SGPR holding %z (s3) straight to the e64 form.
169 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
170 ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
172 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
173 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13
174 ; SI-NEXT: s_mov_b32 s7, 0xf000
175 ; SI-NEXT: s_mov_b32 s6, 0
176 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
177 ; SI-NEXT: v_mov_b32_e32 v1, 0
178 ; SI-NEXT: s_waitcnt lgkmcnt(0)
179 ; SI-NEXT: v_mov_b32_e32 v2, s1
180 ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
181 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
182 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
185 ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
187 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
188 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
189 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
190 ; VI-NEXT: s_waitcnt lgkmcnt(0)
191 ; VI-NEXT: v_mov_b32_e32 v1, s3
192 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
193 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
194 ; VI-NEXT: v_mov_b32_e32 v2, s1
195 ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
196 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
197 ; VI-NEXT: flat_store_dword v[0:1], v2
200 ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
202 ; GFX10-NEXT: s_clause 0x1
203 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
204 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
205 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
206 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
207 ; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
208 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[0:1]
209 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
210 ; GFX10-NEXT: s_endpgm
212 ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
214 ; GFX11-NEXT: s_clause 0x1
215 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
216 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
217 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
218 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
219 ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0
220 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
221 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[4:5]
222 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
223 ; GFX11-NEXT: s_nop 0
224 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
225 ; GFX11-NEXT: s_endpgm
; IR under test: per-workitem output address, then fcmp + select + store.
226 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
227 %tid.ext = sext i32 %tid to i64
228 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
229 %setcc = fcmp one float %x, 0.0
230 %select = select i1 %setcc, float 1.0, float %z
231 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: compares the kernel argument %x against 0.0 with fcmp one
; and stores to %out[workitem.id.x] the constant 1.0 when the compare is true,
; otherwise %x itself, so the same scalar value feeds both the compare and the
; select.
; The checks expect v_cmp_nlg_f32_e64 followed by a v_cndmask with inline 1.0.
; SI and VI copy the SGPR into v2 for the select; gfx1010 and gfx1100 read the
; same SGPR (s4) in both the compare and the e64 v_cndmask.
235 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
236 ; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
238 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
239 ; SI-NEXT: s_load_dword s0, s[0:1], 0xb
240 ; SI-NEXT: s_mov_b32 s7, 0xf000
241 ; SI-NEXT: s_mov_b32 s6, 0
242 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
243 ; SI-NEXT: v_mov_b32_e32 v1, 0
244 ; SI-NEXT: s_waitcnt lgkmcnt(0)
245 ; SI-NEXT: v_mov_b32_e32 v2, s0
246 ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
247 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
248 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
251 ; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
253 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
254 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
255 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
256 ; VI-NEXT: s_waitcnt lgkmcnt(0)
257 ; VI-NEXT: v_mov_b32_e32 v1, s3
258 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
259 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
260 ; VI-NEXT: v_mov_b32_e32 v2, s0
261 ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
262 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
263 ; VI-NEXT: flat_store_dword v[0:1], v2
266 ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
268 ; GFX10-NEXT: s_clause 0x1
269 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
270 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
271 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
272 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
273 ; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0
274 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[0:1]
275 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
276 ; GFX10-NEXT: s_endpgm
278 ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32:
280 ; GFX11-NEXT: s_clause 0x1
281 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
282 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
283 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
284 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
285 ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
286 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
287 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3]
288 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
289 ; GFX11-NEXT: s_nop 0
290 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
291 ; GFX11-NEXT: s_endpgm
; IR under test: %x is both the compared value and the select's false operand.
292 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
293 %tid.ext = sext i32 %tid to i64
294 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
295 %setcc = fcmp one float %x, 0.0
296 %select = select i1 %setcc, float 1.0, float %x
297 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: compares the kernel argument %x against 0.0 with fcmp one
; and stores to %out[workitem.id.x] the constant 0.0 when the compare is true,
; otherwise the kernel argument %z. The select constant equals the compare
; constant here.
; The checks expect v_cmp_nlg_f32_e64 ..., 0 and a v_cndmask whose constant
; operand is the inline value 0. SI and VI copy %z into v2; gfx1010 and gfx1100
; hand the SGPR holding %z (s3) directly to v_cndmask_b32_e64.
301 define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 {
302 ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
304 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
305 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x13
306 ; SI-NEXT: s_mov_b32 s7, 0xf000
307 ; SI-NEXT: s_mov_b32 s6, 0
308 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
309 ; SI-NEXT: v_mov_b32_e32 v1, 0
310 ; SI-NEXT: s_waitcnt lgkmcnt(0)
311 ; SI-NEXT: v_mov_b32_e32 v2, s1
312 ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
313 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
314 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
317 ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
319 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
320 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
321 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
322 ; VI-NEXT: s_waitcnt lgkmcnt(0)
323 ; VI-NEXT: v_mov_b32_e32 v1, s3
324 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
325 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
326 ; VI-NEXT: v_mov_b32_e32 v2, s1
327 ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
328 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
329 ; VI-NEXT: flat_store_dword v[0:1], v2
332 ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
334 ; GFX10-NEXT: s_clause 0x1
335 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
336 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
337 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
338 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
339 ; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
340 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[0:1]
341 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
342 ; GFX10-NEXT: s_endpgm
344 ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
346 ; GFX11-NEXT: s_clause 0x1
347 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
348 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
349 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
350 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
351 ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0
352 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
353 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5]
354 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
355 ; GFX11-NEXT: s_nop 0
356 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
357 ; GFX11-NEXT: s_endpgm
; IR under test: the select's true operand (0.0) matches the compare constant.
358 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
359 %tid.ext = sext i32 %tid to i64
360 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
361 %setcc = fcmp one float %x, 0.0
362 %select = select i1 %setcc, float 0.0, float %z
363 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: compares the kernel argument %x against 0.0 with fcmp one
; and stores to %out[workitem.id.x] the constant 0.0 when the compare is true,
; otherwise %x itself. One scalar value and one constant are shared between
; the compare and the select.
; The checks expect v_cmp_nlg_f32_e64 ..., 0 and a v_cndmask with inline 0.
; SI and VI copy the SGPR into v2; gfx1010 and gfx1100 read s4 in both
; instructions.
367 define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 {
368 ; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
370 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
371 ; SI-NEXT: s_load_dword s0, s[0:1], 0xb
372 ; SI-NEXT: s_mov_b32 s7, 0xf000
373 ; SI-NEXT: s_mov_b32 s6, 0
374 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
375 ; SI-NEXT: v_mov_b32_e32 v1, 0
376 ; SI-NEXT: s_waitcnt lgkmcnt(0)
377 ; SI-NEXT: v_mov_b32_e32 v2, s0
378 ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
379 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
380 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
383 ; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
385 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
386 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
387 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
388 ; VI-NEXT: s_waitcnt lgkmcnt(0)
389 ; VI-NEXT: v_mov_b32_e32 v1, s3
390 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
391 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
392 ; VI-NEXT: v_mov_b32_e32 v2, s0
393 ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
394 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
395 ; VI-NEXT: flat_store_dword v[0:1], v2
398 ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
400 ; GFX10-NEXT: s_clause 0x1
401 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
402 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
403 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
404 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
405 ; GFX10-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0
406 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[0:1]
407 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
408 ; GFX10-NEXT: s_endpgm
410 ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32:
412 ; GFX11-NEXT: s_clause 0x1
413 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
414 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
415 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
416 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
417 ; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
418 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
419 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3]
420 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
421 ; GFX11-NEXT: s_nop 0
422 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
423 ; GFX11-NEXT: s_endpgm
; IR under test: same value %x and same constant 0.0 in compare and select.
424 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
425 %tid.ext = sext i32 %tid to i64
426 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
427 %setcc = fcmp one float %x, 0.0
428 %select = select i1 %setcc, float 0.0, float %x
429 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: compares the kernel argument %x against 0.0 with fcmp one
; and stores to %out[workitem.id.x] the constant 0.0 when the compare is true,
; otherwise the float loaded from %z.ptr[workitem.id.x]. The compared value
; comes from a scalar load while the select's false operand comes from a
; per-workitem memory load into a VGPR.
; Every check block expects v_cmp_nlg_f32_e64 vcc, <sgpr>, 0 followed by
; v_cndmask_b32_e32 with inline 0 and the loaded VGPR; no extra copy of the
; select operand appears.
433 define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
434 ; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
436 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
437 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
438 ; SI-NEXT: s_mov_b32 s7, 0xf000
439 ; SI-NEXT: s_mov_b32 s6, 0
440 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
441 ; SI-NEXT: v_mov_b32_e32 v1, 0
442 ; SI-NEXT: s_waitcnt lgkmcnt(0)
443 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
444 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
445 ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
446 ; SI-NEXT: s_waitcnt vmcnt(0)
447 ; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
448 ; SI-NEXT: s_waitcnt lgkmcnt(0)
449 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
452 ; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
454 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
455 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
456 ; VI-NEXT: s_waitcnt lgkmcnt(0)
457 ; VI-NEXT: v_mov_b32_e32 v1, s3
458 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
459 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
460 ; VI-NEXT: flat_load_dword v3, v[0:1]
461 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
462 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
463 ; VI-NEXT: s_waitcnt lgkmcnt(0)
464 ; VI-NEXT: v_mov_b32_e32 v1, s3
465 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
466 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
467 ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
468 ; VI-NEXT: s_waitcnt vmcnt(0)
469 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc
470 ; VI-NEXT: flat_store_dword v[0:1], v2
473 ; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
475 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
476 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
477 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
478 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
479 ; GFX10-NEXT: s_clause 0x1
480 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
481 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
482 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
483 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
484 ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
485 ; GFX10-NEXT: s_waitcnt vmcnt(0)
486 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
487 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
488 ; GFX10-NEXT: s_endpgm
490 ; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32:
492 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
493 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
494 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
495 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
496 ; GFX11-NEXT: s_clause 0x1
497 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
498 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
499 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
500 ; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
501 ; GFX11-NEXT: s_waitcnt vmcnt(0)
502 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
503 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
504 ; GFX11-NEXT: s_nop 0
505 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
506 ; GFX11-NEXT: s_endpgm
; IR under test: %z is loaded per workitem, then compared-and-selected with
; the scalar %x and the constant 0.0.
507 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
508 %tid.ext = sext i32 %tid to i64
509 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
510 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
511 %z = load float, ptr addrspace(1) %z.gep
512 %setcc = fcmp one float %x, 0.0
513 %select = select i1 %setcc, float 0.0, float %z
514 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: compares the kernel argument %x against 0.0 with fcmp one
; and stores to %out[workitem.id.x] the constant 1.0 when the compare is true,
; otherwise the float loaded from %z.ptr[workitem.id.x]. This is the variant
; in which the select constant (1.0) differs from the compare constant (0.0)
; and the false operand is a loaded VGPR value.
; Every check block expects v_cmp_nlg_f32_e64 vcc, <sgpr>, 0 followed by
; v_cndmask_b32_e32 with inline 1.0 and the loaded VGPR.
518 define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 {
519 ; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
521 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
522 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
523 ; SI-NEXT: s_mov_b32 s7, 0xf000
524 ; SI-NEXT: s_mov_b32 s6, 0
525 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
526 ; SI-NEXT: v_mov_b32_e32 v1, 0
527 ; SI-NEXT: s_waitcnt lgkmcnt(0)
528 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
529 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
530 ; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
531 ; SI-NEXT: s_waitcnt vmcnt(0)
532 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
533 ; SI-NEXT: s_waitcnt lgkmcnt(0)
534 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
537 ; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
539 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
540 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
541 ; VI-NEXT: s_waitcnt lgkmcnt(0)
542 ; VI-NEXT: v_mov_b32_e32 v1, s3
543 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
544 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
545 ; VI-NEXT: flat_load_dword v3, v[0:1]
546 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
547 ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
548 ; VI-NEXT: s_waitcnt lgkmcnt(0)
549 ; VI-NEXT: v_mov_b32_e32 v1, s3
550 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
551 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
552 ; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
553 ; VI-NEXT: s_waitcnt vmcnt(0)
554 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
555 ; VI-NEXT: flat_store_dword v[0:1], v2
558 ; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
560 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
561 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
562 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
563 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
564 ; GFX10-NEXT: s_clause 0x1
565 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
566 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
567 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
568 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
569 ; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
570 ; GFX10-NEXT: s_waitcnt vmcnt(0)
571 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
572 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
573 ; GFX10-NEXT: s_endpgm
575 ; GFX11-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32:
577 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
578 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
579 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
580 ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
581 ; GFX11-NEXT: s_clause 0x1
582 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
583 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
584 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
585 ; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
586 ; GFX11-NEXT: s_waitcnt vmcnt(0)
587 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
588 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
589 ; GFX11-NEXT: s_nop 0
590 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
591 ; GFX11-NEXT: s_endpgm
; IR under test: %z is loaded per workitem, then compared-and-selected with
; the scalar %x and the constant 1.0.
592 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
593 %tid.ext = sext i32 %tid to i64
594 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
595 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
596 %z = load float, ptr addrspace(1) %z.gep
597 %setcc = fcmp one float %x, 0.0
598 %select = select i1 %setcc, float 1.0, float %z
599 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: loads %x from %x.ptr[workitem.id.x], compares it against
; 0.0 with fcmp olt, and stores to %out[workitem.id.x] the constant 1.0 when
; the compare is true, otherwise the kernel argument %z. Here the compared
; value is a loaded VGPR and the select's false operand arrives via a scalar
; load.
; The checks expect the inverted, operand-swapped compare
; v_cmp_ngt_f32_e32 vcc, 0, <vgpr>. SI and VI then copy %z into a VGPR for
; v_cndmask_b32_e32; gfx1010 and gfx1100 use v_cndmask_b32_e64 with the SGPR
; holding %z (s0) and vcc.
603 define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 {
604 ; SI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
606 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
607 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
608 ; SI-NEXT: s_mov_b32 s3, 0xf000
609 ; SI-NEXT: s_mov_b32 s2, 0
610 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
611 ; SI-NEXT: v_mov_b32_e32 v1, 0
612 ; SI-NEXT: s_waitcnt lgkmcnt(0)
613 ; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
614 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
615 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
616 ; SI-NEXT: v_mov_b32_e32 v3, s8
617 ; SI-NEXT: s_waitcnt vmcnt(0)
618 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v2
619 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
620 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
623 ; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
625 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
626 ; VI-NEXT: s_load_dword s0, s[0:1], 0x34
627 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
628 ; VI-NEXT: s_waitcnt lgkmcnt(0)
629 ; VI-NEXT: v_mov_b32_e32 v1, s7
630 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
631 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
632 ; VI-NEXT: flat_load_dword v3, v[0:1]
633 ; VI-NEXT: v_mov_b32_e32 v1, s5
634 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
635 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
636 ; VI-NEXT: v_mov_b32_e32 v4, s0
637 ; VI-NEXT: s_waitcnt vmcnt(0)
638 ; VI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v3
639 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc
640 ; VI-NEXT: flat_store_dword v[0:1], v2
643 ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
645 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
646 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
647 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
648 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
649 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
650 ; GFX10-NEXT: s_waitcnt vmcnt(0)
651 ; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
652 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s0, vcc
653 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
654 ; GFX10-NEXT: s_endpgm
656 ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32:
658 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
659 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
660 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34
661 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
662 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
663 ; GFX11-NEXT: s_waitcnt vmcnt(0)
664 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1
665 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s0, vcc
666 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
667 ; GFX11-NEXT: s_nop 0
668 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
669 ; GFX11-NEXT: s_endpgm
; IR under test: %x is loaded per workitem; %z is the scalar kernel argument.
670 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
671 %tid.ext = sext i32 %tid to i64
672 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
673 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
674 %x = load float, ptr addrspace(1) %x.gep
675 %setcc = fcmp olt float %x, 0.0
676 %select = select i1 %setcc, float 1.0, float %z
677 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: volatile-loads %x from %x.ptr[workitem.id.x] and %z from
; %z.ptr[workitem.id.x], compares %x against 0.0 with fcmp ult, and stores to
; %out[workitem.id.x] the constant 1.0 when the compare is true, otherwise %z.
; Both the compared value and the select's false operand are loaded VGPRs.
; The checks expect the two loads to stay separate and ordered (each is
; followed by its own s_waitcnt vmcnt(0) and carries glc, plus dlc on
; gfx1010/gfx1100), then v_cmp_le_f32_e32 vcc, 0, <x> and a
; v_cndmask_b32_e32 with inline 1.0.
681 define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
682 ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
684 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
685 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
686 ; SI-NEXT: s_mov_b32 s11, 0xf000
687 ; SI-NEXT: s_mov_b32 s10, 0
688 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
689 ; SI-NEXT: v_mov_b32_e32 v1, 0
690 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
691 ; SI-NEXT: s_waitcnt lgkmcnt(0)
692 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
693 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
694 ; SI-NEXT: s_waitcnt vmcnt(0)
695 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
696 ; SI-NEXT: s_waitcnt vmcnt(0)
697 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
698 ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2
699 ; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
700 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
703 ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
705 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
706 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
707 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
708 ; VI-NEXT: s_waitcnt lgkmcnt(0)
709 ; VI-NEXT: v_mov_b32_e32 v1, s7
710 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
711 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
712 ; VI-NEXT: v_mov_b32_e32 v3, s1
713 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
714 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
715 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
716 ; VI-NEXT: s_waitcnt vmcnt(0)
717 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
718 ; VI-NEXT: s_waitcnt vmcnt(0)
719 ; VI-NEXT: v_mov_b32_e32 v1, s5
720 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
721 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
722 ; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v5
723 ; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
724 ; VI-NEXT: flat_store_dword v[0:1], v2
727 ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
729 ; GFX10-NEXT: s_clause 0x1
730 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
731 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
732 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
733 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
734 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
735 ; GFX10-NEXT: s_waitcnt vmcnt(0)
736 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
737 ; GFX10-NEXT: s_waitcnt vmcnt(0)
738 ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
739 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
740 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
741 ; GFX10-NEXT: s_endpgm
743 ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
745 ; GFX11-NEXT: s_clause 0x1
746 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
747 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
748 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
749 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
750 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
751 ; GFX11-NEXT: s_waitcnt vmcnt(0)
752 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
753 ; GFX11-NEXT: s_waitcnt vmcnt(0)
754 ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
755 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
756 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
757 ; GFX11-NEXT: s_nop 0
758 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
759 ; GFX11-NEXT: s_endpgm
; IR under test: both operands are volatile per-workitem loads; the predicate
; is the unordered-or-less-than form (ult).
760 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
761 %tid.ext = sext i32 %tid to i64
762 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
763 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
764 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
765 %x = load volatile float, ptr addrspace(1) %x.gep
766 %z = load volatile float, ptr addrspace(1) %z.gep
767 %setcc = fcmp ult float %x, 0.0
768 %select = select i1 %setcc, float 1.0, float %z
769 store float %select, ptr addrspace(1) %out.gep
; Kernel under test: integer counterpart of the two-VGPR float case.
; Volatile-loads i32 %x from %x.ptr[workitem.id.x] and i32 %z from
; %z.ptr[workitem.id.x], tests %x < 0 (icmp slt), and stores to
; %out[workitem.id.x] the constant 2 when the test is true, otherwise %z.
; The checks expect the inverted test written as v_cmp_lt_i32_e32 vcc, -1, <x>
; followed by v_cndmask_b32_e32 with 2 as an inline constant operand.
773 define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
774 ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
776 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
777 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
778 ; SI-NEXT: s_mov_b32 s11, 0xf000
779 ; SI-NEXT: s_mov_b32 s10, 0
780 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
781 ; SI-NEXT: v_mov_b32_e32 v1, 0
782 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
783 ; SI-NEXT: s_waitcnt lgkmcnt(0)
784 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
785 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
786 ; SI-NEXT: s_waitcnt vmcnt(0)
787 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
788 ; SI-NEXT: s_waitcnt vmcnt(0)
789 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
790 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
791 ; SI-NEXT: v_cndmask_b32_e32 v2, 2, v3, vcc
792 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
795 ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
797 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
798 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
799 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
800 ; VI-NEXT: s_waitcnt lgkmcnt(0)
801 ; VI-NEXT: v_mov_b32_e32 v1, s7
802 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
803 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
804 ; VI-NEXT: v_mov_b32_e32 v3, s1
805 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
806 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
807 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
808 ; VI-NEXT: s_waitcnt vmcnt(0)
809 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
810 ; VI-NEXT: s_waitcnt vmcnt(0)
811 ; VI-NEXT: v_mov_b32_e32 v1, s5
812 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
813 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
814 ; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
815 ; VI-NEXT: v_cndmask_b32_e32 v2, 2, v2, vcc
816 ; VI-NEXT: flat_store_dword v[0:1], v2
819 ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
821 ; GFX10-NEXT: s_clause 0x1
822 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
823 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
824 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
825 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
826 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
827 ; GFX10-NEXT: s_waitcnt vmcnt(0)
828 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
829 ; GFX10-NEXT: s_waitcnt vmcnt(0)
830 ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
831 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc
832 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
833 ; GFX10-NEXT: s_endpgm
835 ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
837 ; GFX11-NEXT: s_clause 0x1
838 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
839 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
840 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
841 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
842 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
843 ; GFX11-NEXT: s_waitcnt vmcnt(0)
844 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
845 ; GFX11-NEXT: s_waitcnt vmcnt(0)
846 ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
847 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc
848 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
849 ; GFX11-NEXT: s_nop 0
850 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
851 ; GFX11-NEXT: s_endpgm
; IR under test: both operands are volatile per-workitem i32 loads; signed
; less-than-zero test selecting the constant 2.
852 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
853 %tid.ext = sext i32 %tid to i64
854 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
855 %z.gep = getelementptr inbounds i32, ptr addrspace(1) %z.ptr, i64 %tid.ext
856 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext
857 %x = load volatile i32, ptr addrspace(1) %x.gep
858 %z = load volatile i32, ptr addrspace(1) %z.gep
859 %setcc = icmp slt i32 %x, 0
860 %select = select i1 %setcc, i32 2, i32 %z
861 store i32 %select, ptr addrspace(1) %out.gep
; 64-bit integer variant: select (icmp slt %x, 0), i64 2, %z on values loaded
; per workitem from %x.ptr and %z.ptr.
; The checks expect a single v_cmp_lt_i64 against -1 and two v_cndmask_b32,
; one per 32-bit half of the result, with the constants 0 (upper register of
; the pair) and 2 (lower register) as the first source operand.
865 define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
866 ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
868 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
869 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
870 ; SI-NEXT: s_mov_b32 s11, 0xf000
871 ; SI-NEXT: s_mov_b32 s10, 0
872 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
873 ; SI-NEXT: v_mov_b32_e32 v1, 0
874 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
875 ; SI-NEXT: s_waitcnt lgkmcnt(0)
876 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
877 ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 glc
878 ; SI-NEXT: s_waitcnt vmcnt(0)
879 ; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 glc
880 ; SI-NEXT: s_waitcnt vmcnt(0)
881 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
882 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3]
883 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
884 ; SI-NEXT: v_cndmask_b32_e32 v2, 2, v4, vcc
885 ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
888 ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
890 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
891 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
892 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
893 ; VI-NEXT: s_waitcnt lgkmcnt(0)
894 ; VI-NEXT: v_mov_b32_e32 v1, s7
895 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
896 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
897 ; VI-NEXT: v_mov_b32_e32 v3, s1
898 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
899 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
900 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
901 ; VI-NEXT: s_waitcnt vmcnt(0)
902 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
903 ; VI-NEXT: s_waitcnt vmcnt(0)
904 ; VI-NEXT: v_mov_b32_e32 v5, s5
905 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
906 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
907 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
908 ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
909 ; VI-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
910 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
913 ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
915 ; GFX10-NEXT: s_clause 0x1
916 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
917 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
918 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
919 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
920 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] glc dlc
921 ; GFX10-NEXT: s_waitcnt vmcnt(0)
922 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] glc dlc
923 ; GFX10-NEXT: s_waitcnt vmcnt(0)
924 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
925 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
926 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
927 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
928 ; GFX10-NEXT: s_endpgm
930 ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
932 ; GFX11-NEXT: s_clause 0x1
933 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
934 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
935 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
936 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
937 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] glc dlc
938 ; GFX11-NEXT: s_waitcnt vmcnt(0)
939 ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] glc dlc
940 ; GFX11-NEXT: s_waitcnt vmcnt(0)
941 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
942 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
943 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
944 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
945 ; GFX11-NEXT: s_nop 0
946 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
947 ; GFX11-NEXT: s_endpgm
948 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
949 %tid.ext = sext i32 %tid to i64
950 %x.gep = getelementptr inbounds i64, ptr addrspace(1) %x.ptr, i64 %tid.ext
951 %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
952 %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
953 %x = load volatile i64, ptr addrspace(1) %x.gep
954 %z = load volatile i64, ptr addrspace(1) %z.gep
955 %setcc = icmp slt i64 %x, 0
956 %select = select i1 %setcc, i64 2, i64 %z
957 store i64 %select, ptr addrspace(1) %out.gep
; Vector select with the constant vector as the false operand:
; select (fcmp ugt float %x, 4.0), %z, <1.0, 2.0, -0.5, 4.0>.
; The scalar float %x and the <4 x float> %z are loaded per workitem.
; The checks expect a single v_cmp_nge_f32 with 4.0 as its first operand and
; four v_cndmask_b32, one per lane, each taking that lane's constant as its
; first source operand.
961 define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
962 ; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
964 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
965 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
966 ; SI-NEXT: s_mov_b32 s11, 0xf000
967 ; SI-NEXT: s_mov_b32 s10, 0
968 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
969 ; SI-NEXT: v_mov_b32_e32 v2, 0
970 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
971 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
972 ; SI-NEXT: v_mov_b32_e32 v5, v2
973 ; SI-NEXT: s_waitcnt lgkmcnt(0)
974 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
975 ; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
976 ; SI-NEXT: s_waitcnt vmcnt(0)
977 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc
978 ; SI-NEXT: s_waitcnt vmcnt(0)
979 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
980 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6
981 ; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
982 ; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
983 ; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
984 ; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
985 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
988 ; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
990 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
991 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
992 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
993 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
994 ; VI-NEXT: s_waitcnt lgkmcnt(0)
995 ; VI-NEXT: v_mov_b32_e32 v2, s7
996 ; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
997 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
998 ; VI-NEXT: v_mov_b32_e32 v0, s1
999 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
1000 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
1001 ; VI-NEXT: flat_load_dword v6, v[1:2] glc
1002 ; VI-NEXT: s_waitcnt vmcnt(0)
1003 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc
1004 ; VI-NEXT: s_waitcnt vmcnt(0)
1005 ; VI-NEXT: v_mov_b32_e32 v7, s5
1006 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v5
1007 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
1008 ; VI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6
1009 ; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1010 ; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1011 ; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1012 ; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1013 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1016 ; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
1018 ; GFX10-NEXT: s_clause 0x1
1019 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1020 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1021 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1022 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0
1023 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1024 ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc
1025 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1026 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc
1027 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1028 ; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6
1029 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1030 ; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1031 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1032 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1033 ; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
1034 ; GFX10-NEXT: s_endpgm
1036 ; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
1038 ; GFX11-NEXT: s_clause 0x1
1039 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1040 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1041 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1042 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1043 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1044 ; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
1045 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1046 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
1047 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1048 ; GFX11-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5
1049 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1050 ; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1051 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1052 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1053 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
1054 ; GFX11-NEXT: s_nop 0
1055 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1056 ; GFX11-NEXT: s_endpgm
1057 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1058 %tid.ext = sext i32 %tid to i64
1059 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1060 %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
1061 %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
1062 %x = load volatile float, ptr addrspace(1) %x.gep
1063 %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
1064 %setcc = fcmp ugt float %x, 4.0
1065 %select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>
1066 store <4 x float> %select, ptr addrspace(1) %out.gep
; Vector select with the constant vector as the true operand:
; select (fcmp ugt float %x, 4.0), <1.0, 2.0, -0.5, 4.0>, %z.
; The checks expect v_cmp_ge_f32 with 4.0 as its first operand (4.0 >= x,
; the inverse of the IR predicate) and four v_cndmask_b32, one per lane,
; each taking that lane's constant as its first source operand.
1070 define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1071 ; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1073 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1074 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1075 ; SI-NEXT: s_mov_b32 s11, 0xf000
1076 ; SI-NEXT: s_mov_b32 s10, 0
1077 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1078 ; SI-NEXT: v_mov_b32_e32 v2, 0
1079 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1080 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1081 ; SI-NEXT: v_mov_b32_e32 v5, v2
1082 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1083 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
1084 ; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
1085 ; SI-NEXT: s_waitcnt vmcnt(0)
1086 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc
1087 ; SI-NEXT: s_waitcnt vmcnt(0)
1088 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1089 ; SI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6
1090 ; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1091 ; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1092 ; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1093 ; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1094 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
1097 ; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1099 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1100 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1101 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1102 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
1103 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1104 ; VI-NEXT: v_mov_b32_e32 v2, s7
1105 ; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
1106 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1107 ; VI-NEXT: v_mov_b32_e32 v0, s1
1108 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
1109 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
1110 ; VI-NEXT: flat_load_dword v6, v[1:2] glc
1111 ; VI-NEXT: s_waitcnt vmcnt(0)
1112 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc
1113 ; VI-NEXT: s_waitcnt vmcnt(0)
1114 ; VI-NEXT: v_mov_b32_e32 v7, s5
1115 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v5
1116 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
1117 ; VI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6
1118 ; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1119 ; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1120 ; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1121 ; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1122 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1125 ; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1127 ; GFX10-NEXT: s_clause 0x1
1128 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1129 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1130 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1131 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0
1132 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1133 ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc
1134 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1135 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc
1136 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1137 ; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6
1138 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1139 ; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1140 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1141 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1142 ; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
1143 ; GFX10-NEXT: s_endpgm
1145 ; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
1147 ; GFX11-NEXT: s_clause 0x1
1148 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1149 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1150 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1151 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1152 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1153 ; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
1154 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1155 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
1156 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1157 ; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5
1158 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1159 ; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1160 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1161 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1162 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
1163 ; GFX11-NEXT: s_nop 0
1164 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1165 ; GFX11-NEXT: s_endpgm
1166 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1167 %tid.ext = sext i32 %tid to i64
1168 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1169 %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
1170 %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
1171 %x = load volatile float, ptr addrspace(1) %x.gep
1172 %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
1173 %setcc = fcmp ugt float %x, 4.0
1174 %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
1175 store <4 x float> %select, ptr addrspace(1) %out.gep
1179 ; This must be swapped as a vector type before the condition has multiple uses.
; Here the constant is the first fcmp operand:
; select (fcmp ugt float 4.0, %x), <1.0, 2.0, -0.5, 4.0>, %z.
; The checks expect v_cmp_le_f32 with 4.0 as its first operand (4.0 <= x,
; the inverse of the IR predicate) and four v_cndmask_b32, one per lane,
; each taking that lane's constant as its first source operand.
1181 define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1182 ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1184 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1185 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1186 ; SI-NEXT: s_mov_b32 s11, 0xf000
1187 ; SI-NEXT: s_mov_b32 s10, 0
1188 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1189 ; SI-NEXT: v_mov_b32_e32 v2, 0
1190 ; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1191 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1192 ; SI-NEXT: v_mov_b32_e32 v5, v2
1193 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1194 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
1195 ; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc
1196 ; SI-NEXT: s_waitcnt vmcnt(0)
1197 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 glc
1198 ; SI-NEXT: s_waitcnt vmcnt(0)
1199 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1200 ; SI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6
1201 ; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1202 ; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1203 ; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1204 ; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1205 ; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
1208 ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1210 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1211 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1212 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1213 ; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
1214 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1215 ; VI-NEXT: v_mov_b32_e32 v2, s7
1216 ; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
1217 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1218 ; VI-NEXT: v_mov_b32_e32 v0, s1
1219 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
1220 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
1221 ; VI-NEXT: flat_load_dword v6, v[1:2] glc
1222 ; VI-NEXT: s_waitcnt vmcnt(0)
1223 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc
1224 ; VI-NEXT: s_waitcnt vmcnt(0)
1225 ; VI-NEXT: v_mov_b32_e32 v7, s5
1226 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v5
1227 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
1228 ; VI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6
1229 ; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1230 ; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1231 ; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1232 ; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1233 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
1236 ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1238 ; GFX10-NEXT: s_clause 0x1
1239 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1240 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1241 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1242 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0
1243 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1244 ; GFX10-NEXT: global_load_dword v6, v4, s[6:7] glc dlc
1245 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1246 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] glc dlc
1247 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1248 ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6
1249 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1250 ; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1251 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1252 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1253 ; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
1254 ; GFX10-NEXT: s_endpgm
1256 ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
1258 ; GFX11-NEXT: s_clause 0x1
1259 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1260 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1261 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1262 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
1263 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1264 ; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
1265 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1266 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
1267 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1268 ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5
1269 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
1270 ; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc
1271 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc
1272 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
1273 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
1274 ; GFX11-NEXT: s_nop 0
1275 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1276 ; GFX11-NEXT: s_endpgm
1277 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1278 %tid.ext = sext i32 %tid to i64
1279 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1280 %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext
1281 %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
1282 %x = load volatile float, ptr addrspace(1) %x.gep
1283 %z = load volatile <4 x float>, ptr addrspace(1) %z.gep
1284 %setcc = fcmp ugt float 4.0, %x
1285 %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z
1286 store <4 x float> %select, ptr addrspace(1) %out.gep
; i1 result: select (icmp slt i32 %x, 0), i1 true, %z, where %z is an i1
; loaded per workitem and the result is stored back as an i1.
; The checks expect no v_cndmask on the loaded value itself. Instead bit 0 of
; the loaded byte is isolated and tested (v_and_b32 + v_cmp_eq_u32), combined
; with the "0 > x" compare mask by s_or_b64, and the combined mask is turned
; into a 0/1 byte by v_cndmask_b32_e64 before the byte store.
1290 define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1291 ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1293 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1294 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1295 ; SI-NEXT: s_mov_b32 s10, 0
1296 ; SI-NEXT: v_mov_b32_e32 v1, 0
1297 ; SI-NEXT: s_mov_b32 s11, 0xf000
1298 ; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1299 ; SI-NEXT: v_mov_b32_e32 v3, v1
1300 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1301 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1302 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
1303 ; SI-NEXT: buffer_load_dword v2, v[2:3], s[8:11], 0 addr64 glc
1304 ; SI-NEXT: s_waitcnt vmcnt(0)
1305 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 glc
1306 ; SI-NEXT: s_waitcnt vmcnt(0)
1307 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1308 ; SI-NEXT: v_and_b32_e32 v3, 1, v3
1309 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
1310 ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3
1311 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1312 ; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1313 ; SI-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64
1316 ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1318 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1319 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1320 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1321 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1322 ; VI-NEXT: v_mov_b32_e32 v2, s7
1323 ; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
1324 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1325 ; VI-NEXT: v_mov_b32_e32 v4, s1
1326 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
1327 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
1328 ; VI-NEXT: flat_load_dword v2, v[1:2] glc
1329 ; VI-NEXT: s_waitcnt vmcnt(0)
1330 ; VI-NEXT: flat_load_ubyte v3, v[3:4] glc
1331 ; VI-NEXT: s_waitcnt vmcnt(0)
1332 ; VI-NEXT: v_mov_b32_e32 v1, s5
1333 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
1334 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1335 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
1336 ; VI-NEXT: v_and_b32_e32 v3, 1, v3
1337 ; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3
1338 ; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1339 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
1340 ; VI-NEXT: flat_store_byte v[0:1], v2
1343 ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1345 ; GFX10-NEXT: s_clause 0x1
1346 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1347 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1348 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1349 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1350 ; GFX10-NEXT: global_load_dword v2, v1, s[6:7] glc dlc
1351 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1352 ; GFX10-NEXT: global_load_ubyte v3, v0, s[2:3] glc dlc
1353 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1354 ; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
1355 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v3
1356 ; GFX10-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1
1357 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1358 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
1359 ; GFX10-NEXT: global_store_byte v0, v1, s[4:5]
1360 ; GFX10-NEXT: s_endpgm
1362 ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
1364 ; GFX11-NEXT: s_clause 0x1
1365 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1366 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1367 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1368 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1369 ; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] glc dlc
1370 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1371 ; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] glc dlc
1372 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1373 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
1374 ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
1375 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1376 ; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
1377 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1378 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1379 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
1380 ; GFX11-NEXT: global_store_b8 v0, v1, s[4:5]
1381 ; GFX11-NEXT: s_nop 0
1382 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1383 ; GFX11-NEXT: s_endpgm
1384 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1385 %tid.ext = sext i32 %tid to i64
1386 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
; %z and %out are indexed as i1 elements; the checks show the workitem id
; used as an unshifted byte offset for these two accesses.
1387 %z.gep = getelementptr inbounds i1, ptr addrspace(1) %z.ptr, i64 %tid.ext
1388 %out.gep = getelementptr inbounds i1, ptr addrspace(1) %out, i64 %tid.ext
1389 %x = load volatile i32, ptr addrspace(1) %x.gep
1390 %z = load volatile i1, ptr addrspace(1) %z.gep
1391 %setcc = icmp slt i32 %x, 0
1392 %select = select i1 %setcc, i1 true, i1 %z
1393 store i1 %select, ptr addrspace(1) %out.gep
1397 ; Different types compared vs. selected
; Compares an f32 but selects an f64:
; select (fcmp ult float %x, 0.0), double 1.0, %z.
; The checks expect v_cmp_le_f32 with 0 as its first operand (0 <= x, the
; inverse of the IR predicate) and two v_cndmask_b32, one per 32-bit half of
; the double. The upper half selects against 0x3ff00000 (the upper dword of
; double 1.0), which is first moved into a VGPR under the SI and VI prefixes
; and used directly as a literal operand under GFX10 and GFX11; the lower
; half selects against 0.
1398 define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1399 ; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1401 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1402 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1403 ; SI-NEXT: s_mov_b32 s11, 0xf000
1404 ; SI-NEXT: s_mov_b32 s10, 0
1405 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1406 ; SI-NEXT: v_mov_b32_e32 v2, 0
1407 ; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
1408 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1409 ; SI-NEXT: v_mov_b32_e32 v4, v2
1410 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1411 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
1412 ; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
1413 ; SI-NEXT: s_waitcnt vmcnt(0)
1414 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 glc
1415 ; SI-NEXT: s_waitcnt vmcnt(0)
1416 ; SI-NEXT: v_mov_b32_e32 v5, 0x3ff00000
1417 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1418 ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2
1419 ; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
1420 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1421 ; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64
1424 ; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1426 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1427 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1428 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1429 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
1430 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1431 ; VI-NEXT: v_mov_b32_e32 v2, s7
1432 ; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
1433 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1434 ; VI-NEXT: v_mov_b32_e32 v0, s1
1435 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
1436 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
1437 ; VI-NEXT: flat_load_dword v6, v[1:2] glc
1438 ; VI-NEXT: s_waitcnt vmcnt(0)
1439 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc
1440 ; VI-NEXT: s_waitcnt vmcnt(0)
1441 ; VI-NEXT: v_mov_b32_e32 v3, s5
1442 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v5
1443 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1444 ; VI-NEXT: v_mov_b32_e32 v4, 0x3ff00000
1445 ; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v6
1446 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
1447 ; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1448 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1451 ; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1453 ; GFX10-NEXT: s_clause 0x1
1454 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1455 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1456 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1457 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
1458 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1459 ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc
1460 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1461 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc
1462 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1463 ; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4
1464 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
1465 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1466 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
1467 ; GFX10-NEXT: s_endpgm
1469 ; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
1471 ; GFX11-NEXT: s_clause 0x1
1472 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1473 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1474 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1475 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1476 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1477 ; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc
1478 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1479 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc
1480 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1481 ; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v3
1482 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
1483 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
1484 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
1485 ; GFX11-NEXT: s_nop 0
1486 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1487 ; GFX11-NEXT: s_endpgm
1488 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1489 %tid.ext = sext i32 %tid to i64
1490 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1491 %z.gep = getelementptr inbounds double, ptr addrspace(1) %z.ptr, i64 %tid.ext
1492 %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
1493 %x = load volatile float, ptr addrspace(1) %x.gep
1494 %z = load volatile double, ptr addrspace(1) %z.gep
1495 %setcc = fcmp ult float %x, 0.0
1496 %select = select i1 %setcc, double 1.0, double %z
1497 store double %select, ptr addrspace(1) %out.gep
1501 ; Different types compared vs. selected
; The compare is on f32 while the selected value is i64: per workitem this
; kernel loads a float %x and an i64 %z and stores (%x one 0.0) ? 3 : %z.
; The checks below expect one v_cmp_nlg_f32 feeding two 32-bit v_cndmask ops,
; with the two halves of the constant (3 and 0) as the first source operand.
1502 define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1503 ; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1505 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1506 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1507 ; SI-NEXT: s_mov_b32 s11, 0xf000
1508 ; SI-NEXT: s_mov_b32 s10, 0
1509 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1510 ; SI-NEXT: v_mov_b32_e32 v2, 0
1511 ; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
1512 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1513 ; SI-NEXT: v_mov_b32_e32 v4, v2
1514 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1515 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
1516 ; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
1517 ; SI-NEXT: s_waitcnt vmcnt(0)
1518 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64 glc
1519 ; SI-NEXT: s_waitcnt vmcnt(0)
1520 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1521 ; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2
1522 ; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
1523 ; SI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
1524 ; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64
1527 ; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1529 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1530 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1531 ; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1532 ; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
1533 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1534 ; VI-NEXT: v_mov_b32_e32 v2, s7
1535 ; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
1536 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
1537 ; VI-NEXT: v_mov_b32_e32 v0, s1
1538 ; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
1539 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
1540 ; VI-NEXT: flat_load_dword v6, v[1:2] glc
1541 ; VI-NEXT: s_waitcnt vmcnt(0)
1542 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc
1543 ; VI-NEXT: s_waitcnt vmcnt(0)
1544 ; VI-NEXT: v_mov_b32_e32 v3, s5
1545 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v5
1546 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1547 ; VI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v6
1548 ; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
1549 ; VI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
1550 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1553 ; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1555 ; GFX10-NEXT: s_clause 0x1
1556 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1557 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1558 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
1559 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
1560 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1561 ; GFX10-NEXT: global_load_dword v4, v2, s[6:7] glc dlc
1562 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1563 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3] glc dlc
1564 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1565 ; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4
1566 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
1567 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
1568 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
1569 ; GFX10-NEXT: s_endpgm
1571 ; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
1573 ; GFX11-NEXT: s_clause 0x1
1574 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1575 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1576 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
1577 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
1578 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1579 ; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc
1580 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1581 ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc
1582 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1583 ; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3
1584 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
1585 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
1586 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
1587 ; GFX11-NEXT: s_nop 0
1588 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1589 ; GFX11-NEXT: s_endpgm
; Index all three buffers by the sign-extended workitem id.
1590 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1591 %tid.ext = sext i32 %tid to i64
1592 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1593 %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
1594 %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
; Both loads are volatile; the expected output issues them separately, each
; followed by its own s_waitcnt.
1595 %x = load volatile float, ptr addrspace(1) %x.gep
1596 %z = load volatile i64, ptr addrspace(1) %z.gep
; fcmp one = ordered and not equal. The checks expect the inverted predicate
; (nlg), so vcc set picks %z and vcc clear picks the constant.
1597 %setcc = fcmp one float %x, 0.0
1598 %select = select i1 %setcc, i64 3, i64 %z
1599 store i64 %select, ptr addrspace(1) %out.gep
1603 ; Different types compared vs. selected
; Here the compare is an integer one (icmp ugt %x, 1) and the selected values
; are f32: the constant 4.0 or a loaded float %z. The checks below expect the
; compare to be emitted as v_cmp_gt_u32 with constant 2 (i.e. 2 > %x) and a
; single v_cndmask with 4.0 as its first source operand.
1604 define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1605 ; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1607 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1608 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1609 ; SI-NEXT: s_mov_b32 s11, 0xf000
1610 ; SI-NEXT: s_mov_b32 s10, 0
1611 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1612 ; SI-NEXT: v_mov_b32_e32 v1, 0
1613 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1614 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1615 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
1616 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
1617 ; SI-NEXT: s_waitcnt vmcnt(0)
1618 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
1619 ; SI-NEXT: s_waitcnt vmcnt(0)
1620 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1621 ; SI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2
1622 ; SI-NEXT: v_cndmask_b32_e32 v2, 4.0, v3, vcc
1623 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
1626 ; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1628 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1629 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1630 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1631 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1632 ; VI-NEXT: v_mov_b32_e32 v1, s7
1633 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
1634 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1635 ; VI-NEXT: v_mov_b32_e32 v3, s1
1636 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
1637 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1638 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
1639 ; VI-NEXT: s_waitcnt vmcnt(0)
1640 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
1641 ; VI-NEXT: s_waitcnt vmcnt(0)
1642 ; VI-NEXT: v_mov_b32_e32 v1, s5
1643 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
1644 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1645 ; VI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v5
1646 ; VI-NEXT: v_cndmask_b32_e32 v2, 4.0, v2, vcc
1647 ; VI-NEXT: flat_store_dword v[0:1], v2
1650 ; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1652 ; GFX10-NEXT: s_clause 0x1
1653 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1654 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1655 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1656 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1657 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
1658 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1659 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
1660 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1661 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1
1662 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc
1663 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
1664 ; GFX10-NEXT: s_endpgm
1666 ; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
1668 ; GFX11-NEXT: s_clause 0x1
1669 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1670 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1671 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1672 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1673 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
1674 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1675 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
1676 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1677 ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1
1678 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc
1679 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
1680 ; GFX11-NEXT: s_nop 0
1681 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1682 ; GFX11-NEXT: s_endpgm
; Index all three buffers by the sign-extended workitem id.
1683 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1684 %tid.ext = sext i32 %tid to i64
1685 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext
1686 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
1687 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
; Both loads are volatile; the expected output issues them separately, each
; followed by its own s_waitcnt.
1688 %x = load volatile i32, ptr addrspace(1) %x.gep
1689 %z = load volatile float, ptr addrspace(1) %z.gep
; Unsigned %x > 1 picks 4.0; the checks expect the opposite condition in vcc
; (2 > %x), so vcc set picks %z.
1690 %setcc = icmp ugt i32 %x, 1
1691 %select = select i1 %setcc, float 4.0, float %z
1692 store float %select, ptr addrspace(1) %out.gep
1696 ; FIXME: Should be able to handle multiple uses
; One fcmp result (4.0 ugt %x) feeds two selects that pick -1.0 or -2.0 over
; the same loaded %z; both results are stored volatile to the same address.
; The checks below currently expect a non-inverted v_cmp_nle_f32 and the _e64
; form of v_cndmask with the constant as the second source operand.
; NOTE(review): presumably the FIXME is about folding the constants the way
; the single-use tests do - confirm against the combine being tested.
1697 define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
1698 ; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
1700 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1701 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1702 ; SI-NEXT: s_mov_b32 s11, 0xf000
1703 ; SI-NEXT: s_mov_b32 s10, 0
1704 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1705 ; SI-NEXT: v_mov_b32_e32 v1, 0
1706 ; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
1707 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1708 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
1709 ; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
1710 ; SI-NEXT: s_waitcnt vmcnt(0)
1711 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc
1712 ; SI-NEXT: s_waitcnt vmcnt(0)
1713 ; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
1714 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v2
1715 ; SI-NEXT: v_cndmask_b32_e64 v2, v3, -1.0, vcc
1716 ; SI-NEXT: v_cndmask_b32_e64 v3, v3, -2.0, vcc
1717 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
1718 ; SI-NEXT: s_waitcnt vmcnt(0)
1719 ; SI-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64
1720 ; SI-NEXT: s_waitcnt vmcnt(0)
1723 ; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
1725 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1726 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1727 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1728 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1729 ; VI-NEXT: v_mov_b32_e32 v1, s7
1730 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
1731 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1732 ; VI-NEXT: v_mov_b32_e32 v3, s1
1733 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
1734 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1735 ; VI-NEXT: flat_load_dword v5, v[0:1] glc
1736 ; VI-NEXT: s_waitcnt vmcnt(0)
1737 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
1738 ; VI-NEXT: s_waitcnt vmcnt(0)
1739 ; VI-NEXT: v_mov_b32_e32 v1, s5
1740 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
1741 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1742 ; VI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v5
1743 ; VI-NEXT: v_cndmask_b32_e64 v3, v2, -1.0, vcc
1744 ; VI-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc
1745 ; VI-NEXT: flat_store_dword v[0:1], v3
1746 ; VI-NEXT: s_waitcnt vmcnt(0)
1747 ; VI-NEXT: flat_store_dword v[0:1], v2
1748 ; VI-NEXT: s_waitcnt vmcnt(0)
1751 ; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
1753 ; GFX10-NEXT: s_clause 0x1
1754 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1755 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1756 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1757 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1758 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
1759 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1760 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
1761 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1762 ; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1
1763 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc
1764 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc
1765 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
1766 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1767 ; GFX10-NEXT: global_store_dword v0, v2, s[4:5]
1768 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1769 ; GFX10-NEXT: s_endpgm
1771 ; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
1773 ; GFX11-NEXT: s_clause 0x1
1774 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1775 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1776 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1777 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1778 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
1779 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1780 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
1781 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1782 ; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1
1783 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc
1784 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc
1785 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] dlc
1786 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1787 ; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] dlc
1788 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1789 ; GFX11-NEXT: s_nop 0
1790 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1791 ; GFX11-NEXT: s_endpgm
; Index all three buffers by the sign-extended workitem id.
1792 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1793 %tid.ext = sext i32 %tid to i64
1794 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
1795 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext
1796 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
1797 %x = load volatile float, ptr addrspace(1) %x.gep
1798 %z = load volatile float, ptr addrspace(1) %z.gep
; The single compare result below is the condition of both selects.
1799 %setcc = fcmp ugt float 4.0, %x
1800 %select0 = select i1 %setcc, float -1.0, float %z
1801 %select1 = select i1 %setcc, float -2.0, float %z
; Two volatile stores to the same address; the expected output keeps both,
; each followed by a wait.
1802 store volatile float %select0, ptr addrspace(1) %out.gep
1803 store volatile float %select1, ptr addrspace(1) %out.gep
1807 ; Source modifiers abs/neg only work for f32
; f16 case: selects fabs(%f) when %c != 0 and fneg(%f) otherwise, for a half
; loaded at the workitem's index, and stores the result to %out.
; In the checks below, no target puts abs/neg modifiers on v_cndmask itself.
; The SI prefix expects them on the two v_cvt_f32_f16 conversions; the other
; prefixes expect explicit v_and 0x7fff / v_xor 0x8000 before the select.
1808 define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
1809 ; SI-LABEL: v_cndmask_abs_neg_f16:
1811 ; SI-NEXT: s_load_dword s8, s[0:1], 0xb
1812 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
1813 ; SI-NEXT: s_mov_b32 s3, 0xf000
1814 ; SI-NEXT: s_mov_b32 s6, 0
1815 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1816 ; SI-NEXT: v_mov_b32_e32 v1, 0
1817 ; SI-NEXT: s_mov_b32 s7, s3
1818 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1819 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
1820 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1821 ; SI-NEXT: s_mov_b32 s2, -1
1822 ; SI-NEXT: s_cmp_lg_u32 s8, 0
1823 ; SI-NEXT: s_waitcnt vmcnt(0)
1824 ; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0|
1825 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
1826 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
1827 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1828 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
1829 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1830 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1833 ; VI-LABEL: v_cndmask_abs_neg_f16:
1835 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1836 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1837 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1838 ; VI-NEXT: v_mov_b32_e32 v1, s3
1839 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1840 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1841 ; VI-NEXT: flat_load_ushort v0, v[0:1]
1842 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
1843 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1844 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1845 ; VI-NEXT: s_cmp_lg_u32 s2, 0
1846 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
1847 ; VI-NEXT: s_waitcnt vmcnt(0)
1848 ; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0
1849 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1850 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
1851 ; VI-NEXT: v_mov_b32_e32 v0, s0
1852 ; VI-NEXT: v_mov_b32_e32 v1, s1
1853 ; VI-NEXT: flat_store_short v[0:1], v2
1856 ; GFX10-LABEL: v_cndmask_abs_neg_f16:
1858 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1859 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1860 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1861 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1862 ; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
1863 ; GFX10-NEXT: s_clause 0x1
1864 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
1865 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
1866 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1867 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1868 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0
1869 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0
1870 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1871 ; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff, v0
1872 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1873 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1874 ; GFX10-NEXT: global_store_short v2, v0, s[2:3]
1875 ; GFX10-NEXT: s_endpgm
1877 ; GFX11-LABEL: v_cndmask_abs_neg_f16:
1879 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
1880 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1881 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1882 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1883 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
1884 ; GFX11-NEXT: s_clause 0x1
1885 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
1886 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1887 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1888 ; GFX11-NEXT: s_cmp_lg_u32 s2, 0
1889 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
1890 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1891 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0
1892 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
1893 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1894 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1895 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
1896 ; GFX11-NEXT: s_nop 0
1897 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1898 ; GFX11-NEXT: s_endpgm
; Load this workitem's half from %fptr.
1899 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
1900 %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx
1901 %f = load half, ptr addrspace(1) %f.gep
; Both select operands derive from the same %f: one via fabs, one via fneg.
1902 %f.abs = call half @llvm.fabs.f16(half %f)
1903 %f.neg = fneg half %f
; The condition comes from the scalar kernel argument %c, not from loaded data.
1904 %setcc = icmp ne i32 %c, 0
1905 %select = select i1 %setcc, half %f.abs, half %f.neg
1906 store half %select, ptr addrspace(1) %out
; f32 case: selects fabs(%f) when %c != 0 and fneg(%f) otherwise, for a float
; loaded at the workitem's index, and stores the result to %out.
; For every prefix the checks below expect fabs and fneg to be folded into a
; single v_cndmask_b32_e64 as |v0| and -v0 source modifiers, with the
; condition held in an SGPR pair written by s_cselect_b64.
1910 define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
1911 ; SI-LABEL: v_cndmask_abs_neg_f32:
1913 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1914 ; SI-NEXT: s_load_dword s8, s[0:1], 0xb
1915 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1916 ; SI-NEXT: s_mov_b32 s7, 0xf000
1917 ; SI-NEXT: s_mov_b32 s2, 0
1918 ; SI-NEXT: s_mov_b32 s3, s7
1919 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1920 ; SI-NEXT: v_mov_b32_e32 v1, 0
1921 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1922 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1923 ; SI-NEXT: s_mov_b32 s6, -1
1924 ; SI-NEXT: s_cmp_lg_u32 s8, 0
1925 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
1926 ; SI-NEXT: s_waitcnt vmcnt(0)
1927 ; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1]
1928 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1931 ; VI-LABEL: v_cndmask_abs_neg_f32:
1933 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1934 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1935 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1936 ; VI-NEXT: v_mov_b32_e32 v1, s3
1937 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
1938 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1939 ; VI-NEXT: flat_load_dword v0, v[0:1]
1940 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
1941 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1942 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1943 ; VI-NEXT: s_cmp_lg_u32 s2, 0
1944 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
1945 ; VI-NEXT: s_waitcnt vmcnt(0)
1946 ; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3]
1947 ; VI-NEXT: v_mov_b32_e32 v0, s0
1948 ; VI-NEXT: v_mov_b32_e32 v1, s1
1949 ; VI-NEXT: flat_store_dword v[0:1], v2
1952 ; GFX10-LABEL: v_cndmask_abs_neg_f32:
1954 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1955 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1956 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1957 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1958 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1959 ; GFX10-NEXT: s_clause 0x1
1960 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
1961 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
1962 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1963 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1964 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0
1965 ; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0
1966 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1967 ; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1]
1968 ; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
1969 ; GFX10-NEXT: s_endpgm
1971 ; GFX11-LABEL: v_cndmask_abs_neg_f32:
1973 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
1974 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1975 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1976 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1977 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1978 ; GFX11-NEXT: s_clause 0x1
1979 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
1980 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1981 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1982 ; GFX11-NEXT: s_cmp_lg_u32 s2, 0
1983 ; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0
1984 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1985 ; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
1986 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1987 ; GFX11-NEXT: s_nop 0
1988 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1989 ; GFX11-NEXT: s_endpgm
; Load this workitem's float from %fptr.
1990 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
1991 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
1992 %f = load float, ptr addrspace(1) %f.gep
; Both select operands derive from the same %f: one via fabs, one via fneg.
1993 %f.abs = call float @llvm.fabs.f32(float %f)
1994 %f.neg = fneg float %f
; The condition comes from the scalar kernel argument %c, not from loaded data.
1995 %setcc = icmp ne i32 %c, 0
1996 %select = select i1 %setcc, float %f.abs, float %f.neg
1997 store float %select, ptr addrspace(1) %out
; f64 case: selects fabs(%f) when %c != 0 and fneg(%f) otherwise, for a double
; loaded at the workitem's index, and stores the result to %out.
; The checks below expect no source modifiers: v_and 0x7fffffff and
; v_xor 0x80000000 are applied to v1 (the second register of the loaded pair)
; and the result is chosen by two 32-bit v_cndmask ops.
; NOTE(review): the v_cndmask on v0 has identical source operands in every
; prefix, which looks like a missed simplification - confirm it is intended.
2001 define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
2002 ; SI-LABEL: v_cndmask_abs_neg_f64:
2004 ; SI-NEXT: s_load_dword s8, s[0:1], 0xb
2005 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
2006 ; SI-NEXT: s_mov_b32 s3, 0xf000
2007 ; SI-NEXT: s_mov_b32 s6, 0
2008 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
2009 ; SI-NEXT: v_mov_b32_e32 v1, 0
2010 ; SI-NEXT: s_mov_b32 s7, s3
2011 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2012 ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2013 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2014 ; SI-NEXT: s_mov_b32 s2, -1
2015 ; SI-NEXT: s_cmp_lg_u32 s8, 0
2016 ; SI-NEXT: s_waitcnt vmcnt(0)
2017 ; SI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
2018 ; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
2019 ; SI-NEXT: s_cselect_b64 vcc, -1, 0
2020 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2021 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
2022 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2023 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2026 ; VI-LABEL: v_cndmask_abs_neg_f64:
2028 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2029 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
2030 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2031 ; VI-NEXT: v_mov_b32_e32 v1, s3
2032 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
2033 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2034 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2035 ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
2036 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2037 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2038 ; VI-NEXT: s_cmp_lg_u32 s2, 0
2039 ; VI-NEXT: s_cselect_b64 vcc, -1, 0
2040 ; VI-NEXT: s_waitcnt vmcnt(0)
2041 ; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
2042 ; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
2043 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2044 ; VI-NEXT: v_mov_b32_e32 v3, s1
2045 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
2046 ; VI-NEXT: v_mov_b32_e32 v2, s0
2047 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2050 ; GFX10-LABEL: v_cndmask_abs_neg_f64:
2052 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2053 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
2054 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
2055 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2056 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
2057 ; GFX10-NEXT: s_clause 0x1
2058 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
2059 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
2060 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2061 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2062 ; GFX10-NEXT: s_cmp_lg_u32 s4, 0
2063 ; GFX10-NEXT: s_cselect_b64 vcc, -1, 0
2064 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2065 ; GFX10-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
2066 ; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
2067 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
2068 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2069 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[2:3]
2070 ; GFX10-NEXT: s_endpgm
2072 ; GFX11-LABEL: v_cndmask_abs_neg_f64:
2074 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
2075 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
2076 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
2077 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2078 ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
2079 ; GFX11-NEXT: s_clause 0x1
2080 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
2081 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2082 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2083 ; GFX11-NEXT: s_cmp_lg_u32 s2, 0
2084 ; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
2085 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2086 ; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
2087 ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
2088 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
2089 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
2090 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2091 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
2092 ; GFX11-NEXT: s_nop 0
2093 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2094 ; GFX11-NEXT: s_endpgm
; Load this workitem's double from %fptr.
2095 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
2096 %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx
2097 %f = load double, ptr addrspace(1) %f.gep
; Both select operands derive from the same %f: one via fabs, one via fneg.
2098 %f.abs = call double @llvm.fabs.f64(double %f)
2099 %f.neg = fneg double %f
; The condition comes from the scalar kernel argument %c, not from loaded data.
2100 %setcc = icmp ne i32 %c, 0
2101 %select = select i1 %setcc, double %f.abs, double %f.neg
2102 store double %select, ptr addrspace(1) %out
; Attribute groups referenced by the test: #0 is attached to the kernel
; definitions, #1 to the llvm.amdgcn.workitem.id.x declaration and its calls.
2106 attributes #0 = { nounwind }
2107 attributes #1 = { nounwind readnone }